library(needs)
needs("dplyr", "MASS", "knitr", "readr", "tidyr", "ggplot2",
"e1071", "moments", "corrplot", "Hmisc", "PerformanceAnalytics",
"mice", "car", "glmnet", "ggforce")
prioritize(dplyr)
The data set is collection The World Bank Data, the variables of interest are extracted from the raw data files and combined into a single data frame for analysis. The final data set includes:
Data imports and combining:
# helper functions
importWDI <- function(filepath, value_name) {
df <- read_csv(filepath, skip = 4)
colnames(df) <- tolower(gsub(" ", ".", colnames(df)))
df <- df %>%
pivot_longer(5:ncol(.), names_to = "year",
values_to = "value") %>%
filter(!is.null(value) & !is.na(value)) %>%
mutate(country.code = factor(country.code),
country.name = factor(country.name), year = as.numeric(year)) %>%
select(country.code, country.name, year, value)
colnames(df)[4] <- value_name
df
}
importRegionClass <- function(filepath) {
df <- read_csv(filepath, skip = 4)
colnames(df) <- tolower(gsub(" ", ".", colnames(df)))
df %>%
mutate(country.name = factor(country.name),
region = factor(region)) %>%
select(country.name, reg = region)
}
importIncomeClass <- function(filepath) {
df <- read_csv(filepath, skip = 4)
colnames(df) <- tolower(gsub(" ", ".", colnames(df)))
df %>%
pivot_longer(3:ncol(.), names_to = "year",
values_to = "income") %>%
filter(!is.null(income) & !is.na(income)) %>%
mutate(country.code = factor(country.code),
country.name = factor(country.name), year = as.numeric(year),
income = factor(income)) %>%
select(country.code, country.name, year, income)
}
# import data
setwd("../data")
poverty.headcount <- importWDI("poverty.headcount.215dollar.csv",
"pov")
mpi <- importWDI("mpi.csv", "mpi")
education.expenditure.total <- importWDI("total.education.expenditure.csv",
"edu.total")
education.expenditure.primary <- importWDI("primary.education.expenditure.csv",
"edu.pri")
education.expenditure.secondary <- importWDI("secondary.education.expenditure.csv",
"edu.sec")
education.expenditure.tertiary <- importWDI("tertiary.education.expenditure.csv",
"edu.ter")
health.expenditure <- importWDI("health.expenditure.csv",
"hlth")
military.expenditure <- importWDI("military.expenditure.csv",
"mil")
fdi <- importWDI("fdi.csv", "fdi")
labour.force.participation <- importWDI("labour.force.participation.csv",
"lbr.part")
unemployment.rate <- importWDI("unemployment.csv",
"unemp")
population.growth <- importWDI("population.growth.csv",
"pop.gwth.total")
rural.population.growth <- importWDI("rural.population.growth.csv",
"pop.gwth.rural")
urban.population.growth <- importWDI("urban.population.growth.csv",
"pop.gwth.urban")
gdp.deflator <- importWDI("gdp.deflator.csv", "gdp.dflt")
gender.equality <- importWDI("gender.equality.csv",
"gdr.eql")
gross.capital.formation <- importWDI("gross.capital.formation.csv",
"gcf")
trade <- importWDI("trade.csv", "trade")
region.class <- importRegionClass("region.class.csv")
income.class <- importIncomeClass("income.class.csv")
gdp.pc <- importWDI("gdp.pc.csv", "gdp.pc")
setwd("../src")
We found that the data sets collected from World Bank’s Data helpdesk and The World Bank’s Data have different naming convention for certain countries (e.g. “Czechia” vs. “Czechnia Republic”). So we need to rename these countries to avoid some error when joining.
Furthermore, WDI’s data sets rate also account for non-country (e.g. country.name = “Low income” or “South Asia”). These special groups are not in our scope of interest, which is national, so we eliminate them.
# using poverty.headcount as a naming standard
# (as other data from WDI also use this
# convention) join a subset of data to process
# the names
d <- poverty.headcount %>%
select(country.name) %>%
mutate(inPov = T) %>%
full_join(income.class %>%
select(country.name) %>%
mutate(inIncome = T), by = "country.name") %>%
full_join(region.class %>%
select(country.name) %>%
mutate(inReg = T), by = "country.name") %>%
mutate(inPov = !is.na(inPov), inIncome = !is.na(inIncome),
inReg = !is.na(inReg))
d
First, remove special economic groups from
poverty.headcount. We figured these regions will not appear
in income.class or region.class, so we might
find something from looking at the countries only
appear in poverty.headcount.
d %>%
filter(inPov & (!inIncome | !inReg)) %>%
distinct(country.name)
Lucky! We can look through these 18 results and compose a list of special regions.
spec.reg <- c("Fragile and conflict affected situations",
"IDA total", "World", "East Asia & Pacific", "Europe & Central Asia",
"Latin America & Caribbean", "Middle East & North Africa",
"South Asia", "Sub-Saharan Africa", "Low income",
"Low & middle income", "Lower middle income", "Upper middle income",
"High income")
Then, we rename those countries with inconsistent naming convention. Since we should only care about countries whose poverty headcount is available, reusing the list generated above, we can identify:
# mapping standard name and variation
nameMap <- tibble(standard = c("Cote d'Ivoire", "Czechia",
"Czechia", "Curacao", "Turkiye", "Turkiye", "Sao Tome and Principe"),
variation = c("Côte d'Ivoire", "Czechoslovakia",
"Czech Republic", "Curaçao", "Turkey", "Türkiye",
"São Tomé and Príncipe"))
correctName <- function(name) {
tibble(name = name) %>%
left_join(nameMap, by = c(name = "variation")) %>%
mutate(standard = ifelse(is.na(standard), name,
standard)) %>%
select(standard) %>%
pull()
}
orig.name <- c("Vietnam", "China", "Turkey", "Czechia Republic")
correctName(orig.name)
## [1] "Vietnam" "China" "Turkiye" "Czechia Republic"
Let’s test this out!
d <- poverty.headcount %>%
# correct name here
mutate(country.name = correctName(country.name)) %>%
select(country.name) %>%
mutate(inPov = T) %>%
full_join(income.class %>%
# correct name here
mutate(country.name = correctName(country.name)) %>%
select(country.name) %>%
mutate(inIncome = T), by = "country.name") %>%
full_join(region.class %>%
# correct name here
mutate(country.name = correctName(country.name)) %>%
select(country.name) %>%
mutate(inReg = T), by = "country.name") %>%
filter(!(country.name %in% spec.reg)) %>%
mutate(inPov = !is.na(inPov), inIncome = !is.na(inIncome), inReg = !is.na(inReg))
# countries not in region list, but is in Pov list
d %>%
filter(!inReg & inPov) %>%
distinct(country.name) %>%
nrow()
## [1] 0
# countries not in income list, but is in Pov list
d %>%
filter(!inIncome & inPov) %>%
distinct(country.name) %>%
nrow()
## [1] 0
We are pretty confident that there’s no inconsistent naming left unprocessed in the data sets.
# Rename countries in all data sets.
poverty.headcount <- poverty.headcount %>%
mutate(country.name = correctName(country.name))
mpi <- mpi %>%
mutate(country.name = correctName(country.name))
education.expenditure.total <- education.expenditure.total %>%
mutate(country.name = correctName(country.name))
education.expenditure.primary <- education.expenditure.primary %>%
mutate(country.name = correctName(country.name))
education.expenditure.secondary <- education.expenditure.secondary %>%
mutate(country.name = correctName(country.name))
education.expenditure.tertiary <- education.expenditure.tertiary %>%
mutate(country.name = correctName(country.name))
health.expenditure <- health.expenditure %>%
mutate(country.name = correctName(country.name))
military.expenditure <- military.expenditure %>%
mutate(country.name = correctName(country.name))
fdi <- fdi %>%
mutate(country.name = correctName(country.name))
labour.force.participation <- labour.force.participation %>%
mutate(country.name = correctName(country.name))
unemployment.rate <- unemployment.rate %>%
mutate(country.name = correctName(country.name))
population.growth <- population.growth %>%
mutate(country.name = correctName(country.name))
rural.population.growth <- rural.population.growth %>%
mutate(country.name = correctName(country.name))
urban.population.growth <- urban.population.growth %>%
mutate(country.name = correctName(country.name))
gdp.deflator <- gdp.deflator %>%
mutate(country.name = correctName(country.name))
gender.equality <- gender.equality %>%
mutate(country.name = correctName(country.name))
gross.capital.formation <- gross.capital.formation %>%
mutate(country.name = correctName(country.name))
trade <- trade %>%
mutate(country.name = correctName(country.name))
region.class <- region.class %>%
mutate(country.name = correctName(country.name))
income.class <- income.class %>%
mutate(country.name = correctName(country.name))
gdp.pc <- gdp.pc %>%
mutate(country.name = correctName(country.name))
Join the data
countries <- poverty.headcount %>%
# We used a full join here so we can conduct
# a separate analysis on mpi later
full_join(mpi, by = c("country.name", "country.code",
"year")) %>%
left_join(income.class, c("country.name", "country.code",
"year")) %>%
left_join(region.class, by = "country.name") %>%
left_join(education.expenditure.total, by = c("country.name",
"country.code", "year")) %>%
left_join(education.expenditure.primary, by = c("country.name",
"country.code", "year")) %>%
left_join(education.expenditure.secondary, by = c("country.name",
"country.code", "year")) %>%
left_join(education.expenditure.tertiary, by = c("country.name",
"country.code", "year")) %>%
left_join(health.expenditure, by = c("country.name",
"country.code", "year")) %>%
left_join(military.expenditure, by = c("country.name",
"country.code", "year")) %>%
left_join(fdi, by = c("country.name", "country.code",
"year")) %>%
left_join(labour.force.participation, by = c("country.name",
"country.code", "year")) %>%
left_join(unemployment.rate, by = c("country.name",
"country.code", "year")) %>%
left_join(population.growth, by = c("country.name",
"country.code", "year")) %>%
left_join(rural.population.growth, by = c("country.name",
"country.code", "year")) %>%
left_join(urban.population.growth, by = c("country.name",
"country.code", "year")) %>%
left_join(gdp.deflator, by = c("country.name",
"country.code", "year")) %>%
left_join(gender.equality, by = c("country.name",
"country.code", "year")) %>%
left_join(gross.capital.formation, by = c("country.name",
"country.code", "year")) %>%
left_join(trade, by = c("country.name", "country.code",
"year")) %>%
left_join(gdp.pc, by = c("country.name", "country.code",
"year")) %>%
# filter special groups
filter(!(country.name %in% spec.reg))
Data preview
head(countries)
str(countries)
## tibble [1,901 × 24] (S3: tbl_df/tbl/data.frame)
## $ country.code : Factor w/ 272 levels "AGO","ALB","ARE",..: 1 1 1 2 2 2 2 2 2 2 ...
## $ country.name : chr [1:1901] "Angola" "Angola" "Angola" "Albania" ...
## $ year : num [1:1901] 2000 2008 2018 1996 2002 ...
## $ pov : num [1:1901] 21.4 14.6 31.1 0.5 1.1 0.6 0.2 0.6 1 0.1 ...
## $ mpi : num [1:1901] NA NA NA NA NA NA NA NA NA NA ...
## $ income : Factor w/ 4 levels "H","L","LM","UM": 2 3 3 3 3 3 3 4 4 4 ...
## $ reg : Factor w/ 7 levels "East Asia & Pacific",..: 7 7 7 2 2 2 2 2 2 2 ...
## $ edu.total : num [1:1901] 2.61 2.69 2.04 3.08 3.12 ...
## $ edu.pri : num [1:1901] NA NA NA NA NA ...
## $ edu.sec : num [1:1901] NA NA NA NA NA ...
## $ edu.ter : num [1:1901] NA NA NA NA NA ...
## $ hlth : num [1:1901] 1.91 3.32 2.54 NA 6.91 ...
## $ mil : num [1:1901] 6.39 3.57 1.87 1.38 1.32 ...
## $ fdi : num [1:1901] 8.79e+08 1.68e+09 -6.46e+09 9.01e+07 1.35e+08 ...
## $ lbr.part : num [1:1901] NA NA NA 38.8 59.6 ...
## $ unemp : num [1:1901] NA NA NA 12.3 15.8 ...
## $ pop.gwth.total: num [1:1901] 3.277 3.711 3.276 -0.622 -0.3 ...
## $ pop.gwth.rural: num [1:1901] 0.921 1.91 1.338 -1.546 -2.169 ...
## $ pop.gwth.urban: num [1:1901] 5.682 5.02 4.312 0.812 2.181 ...
## $ gdp.dflt : num [1:1901] 418.02 19.37 28.17 38.17 3.65 ...
## $ gdr.eql : num [1:1901] NA 3 NA NA NA 4 NA NA NA NA ...
## $ gcf : num [1:1901] 30.5 30.8 17.9 18.1 35.3 ...
## $ trade : num [1:1901] 152.5 121.4 66.4 44.9 68.5 ...
## $ gdp.pc : num [1:1901] 557 4081 2525 1010 1425 ...
summary(countries)
## country.code country.name year pov
## BRA : 36 Length:1901 Min. :1967 Min. : 0.00
## CRI : 34 Class :character 1st Qu.:2002 1st Qu.: 0.20
## ARG : 32 Mode :character Median :2009 Median : 1.50
## USA : 32 Mean :2007 Mean :10.04
## DEU : 30 3rd Qu.:2014 3rd Qu.:11.60
## HND : 30 Max. :2021 Max. :91.50
## (Other):1707 NA's :58
## mpi income reg edu.total
## Min. : 2.37 H :644 East Asia & Pacific :167 Min. : 1.033
## 1st Qu.:18.30 L :253 Europe & Central Asia :883 1st Qu.: 3.522
## Median :24.80 LM :501 Latin America & Caribbean :416 Median : 4.519
## Mean :27.06 UM :438 Middle East & North Africa:107 Mean : 4.582
## 3rd Qu.:33.30 NA's: 65 North America : 50 3rd Qu.: 5.457
## Max. :74.20 South Asia : 61 Max. :15.750
## NA's :1446 Sub-Saharan Africa :217 NA's :596
## edu.pri edu.sec edu.ter hlth
## Min. : 0.6578 Min. : 2.724 Min. : 0.00 Min. : 1.718
## 1st Qu.:24.0269 1st Qu.:30.138 1st Qu.:16.61 1st Qu.: 5.151
## Median :30.4730 Median :35.713 Median :20.59 Median : 6.914
## Mean :31.6633 Mean :35.630 Mean :20.96 Mean : 6.975
## 3rd Qu.:38.3324 3rd Qu.:41.380 3rd Qu.:25.14 3rd Qu.: 8.565
## Max. :70.0950 Max. :71.587 Max. :50.44 Max. :17.733
## NA's :1090 NA's :1094 NA's :963 NA's :464
## mil fdi lbr.part unemp
## Min. : 0.000 Min. :-3.444e+11 Min. :30.50 Min. : 0.250
## 1st Qu.: 1.042 1st Qu.: 2.979e+08 1st Qu.:56.17 1st Qu.: 4.513
## Median : 1.468 Median : 1.709e+09 Median :61.18 Median : 6.880
## Mean : 1.787 Mean : 1.598e+10 Mean :60.78 Mean : 8.145
## 3rd Qu.: 2.103 3rd Qu.: 9.821e+09 3rd Qu.:65.49 3rd Qu.:10.078
## Max. :19.385 Max. : 7.338e+11 Max. :93.00 Max. :49.700
## NA's :105 NA's :17 NA's :320 NA's :267
## pop.gwth.total pop.gwth.rural pop.gwth.urban gdp.dflt
## Min. :-3.6295 Min. :-8.56066 Min. :-4.078 Min. : -26.300
## 1st Qu.: 0.2656 1st Qu.:-0.85664 1st Qu.: 0.510 1st Qu.: 1.696
## Median : 1.0318 Median :-0.02461 Median : 1.484 Median : 3.865
## Mean : 1.0565 Mean : 0.00083 Mean : 1.691 Mean : 15.831
## 3rd Qu.: 1.7761 3rd Qu.: 0.96362 3rd Qu.: 2.657 3rd Qu.: 8.537
## Max. : 5.6145 Max. : 4.59686 Max. :13.805 Max. :3333.585
## NA's :14 NA's :13 NA's :18
## gdr.eql gcf trade gdp.pc
## Min. :1.500 Min. : 0.00 Min. : 1.378 Min. : 119.7
## 1st Qu.:3.000 1st Qu.:19.65 1st Qu.: 51.063 1st Qu.: 1927.9
## Median :3.500 Median :22.73 Median : 73.496 Median : 6032.1
## Mean :3.592 Mean :23.88 Mean : 84.429 Mean : 15219.5
## 3rd Qu.:4.000 3rd Qu.:26.76 3rd Qu.:105.462 3rd Qu.: 21490.4
## Max. :5.000 Max. :69.48 Max. :380.104 Max. :123678.7
## NA's :1635 NA's :72 NA's :57 NA's :8
There’s no NA in reg, which is a sign that all naming in
the data is remedied. There’s some expected NAs in income
and pov, as these data are collected by year. There’s a
substantial amount of missing data in mpi, as this is a
relative new concept. We will address the nature, and processing of
missing data in the next sections.
As observed from the summary above, the data set contains a lot of missing values in some of the variables.
mean(is.na(countries))
## [1] 0.1819656
About 19% of the data set is missing.
nCompleteObs <- sum(complete.cases(countries))
print(paste("No. of complete cases:", nCompleteObs))
## [1] "No. of complete cases: 3"
There are only 3 complete cases where all the variable is available. This is nowhere near acceptable to conduct any meaningful analysis. Therefore, we need to eliminate some variables for a more balance data set.
missings <- colMeans(is.na(countries))
ggplot(mapping = aes(x = names(missings), y = missings,
fill = missings < 0.35)) + geom_bar(stat = "identity") +
ggtitle("Missing rate") + xlab("variables") + ylab("% missing") +
theme(axis.text.x = element_text(size = 9, angle = 90))
missings[missings > 0.35]
## mpi edu.pri edu.sec edu.ter gdr.eql
## 0.7606523 0.5733824 0.5754866 0.5065755 0.8600736
There are 5 variables with missing rate >35%.
expenditure in primary, secondary, and tertiary edication can be very useful and relevant information to predict poverty reduction (Akbar et al. 2019). However, we would like to exclude these variables from some first analyses to make use of the richer set of data. We can conduct a separate analysis with these variable to gain more insight.
# variables with high missing rate
hMiss <- names(missings[missings > 0.35])
# exclude these variables in countries1
countries1 <- countries %>%
select(!hMiss) %>%
filter(!is.na(pov))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(hMiss)` instead of `hMiss` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
str(countries1)
## tibble [1,843 × 19] (S3: tbl_df/tbl/data.frame)
## $ country.code : Factor w/ 272 levels "AGO","ALB","ARE",..: 1 1 1 2 2 2 2 2 2 2 ...
## $ country.name : chr [1:1843] "Angola" "Angola" "Angola" "Albania" ...
## $ year : num [1:1843] 2000 2008 2018 1996 2002 ...
## $ pov : num [1:1843] 21.4 14.6 31.1 0.5 1.1 0.6 0.2 0.6 1 0.1 ...
## $ income : Factor w/ 4 levels "H","L","LM","UM": 2 3 3 3 3 3 3 4 4 4 ...
## $ reg : Factor w/ 7 levels "East Asia & Pacific",..: 7 7 7 2 2 2 2 2 2 2 ...
## $ edu.total : num [1:1843] 2.61 2.69 2.04 3.08 3.12 ...
## $ hlth : num [1:1843] 1.91 3.32 2.54 NA 6.91 ...
## $ mil : num [1:1843] 6.39 3.57 1.87 1.38 1.32 ...
## $ fdi : num [1:1843] 8.79e+08 1.68e+09 -6.46e+09 9.01e+07 1.35e+08 ...
## $ lbr.part : num [1:1843] NA NA NA 38.8 59.6 ...
## $ unemp : num [1:1843] NA NA NA 12.3 15.8 ...
## $ pop.gwth.total: num [1:1843] 3.277 3.711 3.276 -0.622 -0.3 ...
## $ pop.gwth.rural: num [1:1843] 0.921 1.91 1.338 -1.546 -2.169 ...
## $ pop.gwth.urban: num [1:1843] 5.682 5.02 4.312 0.812 2.181 ...
## $ gdp.dflt : num [1:1843] 418.02 19.37 28.17 38.17 3.65 ...
## $ gcf : num [1:1843] 30.5 30.8 17.9 18.1 35.3 ...
## $ trade : num [1:1843] 152.5 121.4 66.4 44.9 68.5 ...
## $ gdp.pc : num [1:1843] 557 4081 2525 1010 1425 ...
Re-evaluate the countries1 set.
mean(is.na(countries1))
## [1] 0.05471628
sum(complete.cases(countries1))
## [1] 937
mean(complete.cases(countries1))
## [1] 0.5084102
On average, each column has 6% missing rate, results in 937 complete data point (i.e. 49%). This can be a sufficient number for the analysis. However, the missing data can induce loss of power due to the reduced sample size, and some other biases depending on which variables is missing.
# complete rate of data by regions
countries1 %>%
mutate(isComplete = complete.cases(.)) %>%
group_by(reg) %>%
summarise(complete.rate = mean(isComplete)) %>%
arrange(desc(complete.rate))
Countries from North America, Sub-Saharan Africa, and South Asia have the highest rate of missing data. We suspect that Sub-Saharan Africa, and South Asia are comparably less accessible regions. We also know that Americans don’t like filling out forms, so their high rate of missing data is understandable as well.
Still, we need to find a way to address this issue. we propose several approaches:
Distribution of the predicted variable pov
ggplot(countries, aes(x = pov)) + geom_histogram(aes(y = ..density..),
bins = 30) + geom_density(alpha = 0.2, fill = "blue")
The graph displays a decreasing rate as poverty indicator increasing.
This might not be representative of the current state of poverty in the
world, but of the number presented in our data. For example, more recent
data is likely to be more inclusive than ancient data, when poverty is
more prevalent. We should look at data from the same period.
# number of data point available from 1967 to
# 2021
ggplot(poverty.headcount, aes(x = year)) + geom_histogram(bins = 30)
# pov data from 1998 and 2018
pov.98.18 <- poverty.headcount %>%
filter(year == 1998 | year == 2018)
pov.98.18 %>%
group_by(year) %>%
summarise(sum = n())
ggplot(pov.98.18, aes(x = pov)) + geom_histogram(aes(y = ..density..),
bins = 30) + geom_density(alpha = 0.2, fill = "blue") +
facet_grid(cols = vars(year))
The graph for 1998 has a much gentler slope, meaning poverty was more popular during that time, as predicted from out intuition. What about the general progress of the world?
# Re-import pov and only take special regions
# geographic
geo.regs <- c("EAS", "ECS", "LCN", "MEA", "SAS", "SSF",
"WLD")
# economics
eco.regs <- c("HIC", "LIC", "LMC", "LMY", "UMC")
pov.reg <- importWDI("../data/poverty.headcount.215dollar.csv",
"pov") %>%
filter(country.code %in% c(geo.regs, eco.regs)) %>%
mutate(type = ifelse(country.code %in% geo.regs,
"Geographic", "Economics"))
## New names:
## Rows: 266 Columns: 67
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): Country Name, Country Code, Indicator Name, Indicator Code dbl (50): 1967,
## 1969, 1971, 1974, 1975, 1977, 1978, 1979, 1980, 1981, 1982, ... lgl (13): 1960,
## 1961, 1962, 1963, 1964, 1965, 1966, 1968, 1970, 1972, 1973, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...67`
pov.reg %>%
distinct(country.code, country.name, type) %>%
arrange(type)
# World
ggplot(pov.reg %>%
filter(country.code == "WLD"), aes(x = year, y = pov)) +
geom_point() + geom_smooth(formula = y ~ x, method = loess)
An overall very steady decrease of poverty. How about each region?
ggplot(pov.reg %>%
filter(country.code != "WLD"), aes(x = year, y = pov,
color = country.name)) + geom_point() + geom_smooth(formula = y ~
x, method = loess) + facet_grid(rows = vars(type))
There’s a general steady, but distinct decline of poverty over time in each type region of respective type. Latin America & Caribbean, Europe & Central Asia, Middle East & North Africa, and High Income group has a more gradual decline as they are not very poor to begin with.
Among the income groups, Low & Middle Income, Lower & Middle Income, and Upper Middle Income have quite similar in term of poverty indicator and slope over the year. While these values vary greatly among different geographical regions.
Let’s see some important statistics
stats <- poverty.headcount %>%
summarise(count = n(), skewness = skewness(pov),
kurtosis = kurtosis(pov), std.deviation = sd(pov))
kable(stats)
| count | skewness | kurtosis | std.deviation |
|---|---|---|---|
| 2322 | 1.598182 | 1.661128 | 19.48582 |
We can conduct some preliminary checks on linearity and correlation
of predictors to have a better picture of the data. Checklist
1. Linear relationship:
Use correlation matrix to check linearity
# select only numeric data
countries.num <- countries %>%
select(where(is.numeric))
chart.Correlation(countries.num, histogram = TRUE,
pch = 19)
Which is utterly intelligible, but a majority of the fitted lines are
linear at first glance. We should render separate graphs for the
relationship between
pov & other variables.
# lengthen data table to variable-value pairs,
# with pov as predicted variable and others as
# predictive
countries.num2 <- countries.num %>%
pivot_longer(cols = 3:ncol(.), names_to = "variable",
values_to = "value") %>%
filter(!is.na(value) & !is.na(pov))
drawGraph <- function(indvar, data) {
ggplot(data %>%
filter(variable == indvar), aes(x = value,
y = pov)) + geom_point() + geom_smooth(method = lm) +
labs(title = paste("Relationship pov ~", indvar),
x = indvar, y = "pov")
}
ivs <- distinct(countries.num2, variable)[[1]]
# for (indvar in ivs) { print(
# ggplot(countries.num2 %>% filter(variable ==
# indvar), aes(x = value, y = pov)) +
# geom_point() + geom_smooth(formula = y~x,
# method = lm) + labs(title = paste('Relationship
# pov ~', indvar), x = indvar, y = 'pov') ) }
p <- ggplot(countries.num2, aes(x = value, y = pov)) +
geom_point() + geom_smooth(formula = y ~ x, method = lm) +
labs(title = paste("pov ~ variable"), x = "variable",
y = "pov") + facet_wrap_paginate(~variable,
ncol = 3, nrow = 1, scales = "free")
requiredPages <- n_pages(p)
for (i in 1:requiredPages) {
ggplot(countries.num2, aes(x = value, y = pov)) +
geom_point() + geom_smooth(formula = y ~ x,
method = lm) + labs(title = paste("pov ~ variable"),
x = "variable", y = "pov") + facet_wrap_paginate(~variable,
ncol = 3, nrow = 1, scales = "free", page = i) ->
p
print(p)
}
Most variables display linear relationship with some exceptions, which are more appropriate to assume a logarithmic relationship. Gender equality should be viewed as a categorical data.
logIvs <- c("fdi", "gdp.dflt", "gdp.pc")
for (indvar in logIvs) {
print(ggplot(countries.num2 %>%
filter(variable == indvar), aes(x = log(value),
y = pov)) + geom_point() + geom_smooth(method = lm) +
labs(title = paste0("pov ~ log(", indvar, ")"),
x = paste0("log(", indvar, ")"), y = "pov"))
}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
The relationship are more linear after the transformation.
gdr.eql <- countries.num2 %>%
filter(variable == "gdr.eql") %>%
mutate(value = factor(value))
ggplot(gdr.eql, aes(x = value, y = pov)) + geom_boxplot() +
labs(title = paste("pov ~ gdr.eql"), x = "gdr.eql",
y = "pov")
There’s a significant correlation between gender equality and
poverty.
Correlation coefficients between pov and predictors.
# transform some variables
countries.num3 <- countries.num %>%
mutate(lfdi = log(fdi), lgdp.dflt = log(gdp.dflt),
lgdp.pc = log(gdp.pc)) %>%
mutate(lfdi = ifelse(is.nan(lfdi) | lfdi == -Inf,
NA, lfdi))
# name of the independent variables
cn <- colnames(countries.num3)[-c(1, 2)]
# correlation of independent variable and pov
corWithPov <- function(indevar, data) {
cor(data[[indevar]], data[["pov"]], use = "complete.obs")
}
cor.pov <- sapply(cn, corWithPov, data = countries.num3)
kable(cor.pov)
| x | |
|---|---|
| mpi | 0.4036012 |
| edu.total | -0.3098112 |
| edu.pri | 0.4711269 |
| edu.sec | -0.3172901 |
| edu.ter | -0.1284782 |
| hlth | -0.2938428 |
| mil | -0.0069375 |
| fdi | -0.1441670 |
| lbr.part | 0.2316754 |
| unemp | -0.0947166 |
| pop.gwth.total | 0.5290581 |
| pop.gwth.rural | 0.4943156 |
| pop.gwth.urban | 0.5866539 |
| gdp.dflt | 0.0457111 |
| gdr.eql | -0.4505635 |
| gcf | -0.1108507 |
| trade | -0.2387164 |
| gdp.pc | -0.3773475 |
| lfdi | -0.4875988 |
| lgdp.dflt | 0.2992969 |
| lgdp.pc | -0.7053998 |
edu.ter, mil, fdi,
lbr.part, unemp, gdp.dflt,
gcf have negligible correlation with pov.
lgdp.pc, lfdi, and lgdp.dflt have
stronger linear relationship with pov than their
un-transformed counterparts.
countries1 <- countries1 %>%
mutate(lfdi = log(fdi), lgdp.dflt = log(gdp.dflt),
lgdp.pc = log(gdp.pc)) %>%
mutate(lfdi = ifelse(is.nan(lfdi) | lfdi == -Inf,
NA, lfdi))
2. Predictors are independent There should be no correlation/multicolinearity between each pair of predictors.
countries.arr <- simplify2array(countries.num %>%
select(-c("year", "pov")))
cor.mtx <- rcorr(countries.arr, type = "spearman")
round(cor.mtx$r, 2)
## mpi edu.total edu.pri edu.sec edu.ter hlth mil fdi
## mpi 1.00 -0.49 0.29 -0.10 -0.28 -0.43 0.13 -0.13
## edu.total -0.49 1.00 -0.33 0.12 0.31 0.48 -0.08 0.12
## edu.pri 0.29 -0.33 1.00 -0.62 -0.44 -0.36 0.06 -0.24
## edu.sec -0.10 0.12 -0.62 1.00 0.02 0.26 0.04 0.34
## edu.ter -0.28 0.31 -0.44 0.02 1.00 0.30 0.04 0.19
## hlth -0.43 0.48 -0.36 0.26 0.30 1.00 -0.03 0.22
## mil 0.13 -0.08 0.06 0.04 0.04 -0.03 1.00 0.07
## fdi -0.13 0.12 -0.24 0.34 0.19 0.22 0.07 1.00
## lbr.part -0.41 0.01 0.35 -0.33 -0.04 -0.11 0.02 0.08
## unemp 0.40 0.00 -0.14 0.22 0.00 0.22 0.14 -0.04
## pop.gwth.total -0.14 -0.14 0.60 -0.45 -0.11 -0.29 -0.05 -0.27
## pop.gwth.rural -0.02 -0.16 0.29 -0.18 -0.10 -0.26 0.04 -0.32
## pop.gwth.urban -0.06 -0.21 0.68 -0.51 -0.16 -0.37 -0.07 -0.27
## gdp.dflt 0.19 -0.19 0.22 -0.17 -0.15 -0.38 0.01 -0.22
## gdr.eql -0.43 0.44 -0.50 0.41 -0.21 0.39 -0.11 0.08
## gcf -0.04 -0.10 -0.12 0.01 0.08 -0.21 -0.05 0.04
## trade -0.24 0.30 -0.35 0.08 0.12 0.09 -0.34 -0.09
## gdp.pc -0.66 0.47 -0.53 0.38 0.39 0.59 -0.06 0.59
## lbr.part unemp pop.gwth.total pop.gwth.rural pop.gwth.urban
## mpi -0.41 0.40 -0.14 -0.02 -0.06
## edu.total 0.01 0.00 -0.14 -0.16 -0.21
## edu.pri 0.35 -0.14 0.60 0.29 0.68
## edu.sec -0.33 0.22 -0.45 -0.18 -0.51
## edu.ter -0.04 0.00 -0.11 -0.10 -0.16
## hlth -0.11 0.22 -0.29 -0.26 -0.37
## mil 0.02 0.14 -0.05 0.04 -0.07
## fdi 0.08 -0.04 -0.27 -0.32 -0.27
## lbr.part 1.00 -0.41 0.24 0.15 0.25
## unemp -0.41 1.00 -0.27 -0.16 -0.30
## pop.gwth.total 0.24 -0.27 1.00 0.68 0.91
## pop.gwth.rural 0.15 -0.16 0.68 1.00 0.46
## pop.gwth.urban 0.25 -0.30 0.91 0.46 1.00
## gdp.dflt 0.05 -0.11 0.22 0.11 0.23
## gdr.eql -0.08 0.27 -0.58 -0.55 -0.53
## gcf 0.01 -0.17 -0.06 0.01 0.00
## trade -0.14 -0.03 -0.28 -0.13 -0.29
## gdp.pc -0.04 0.02 -0.44 -0.43 -0.50
## gdp.dflt gdr.eql gcf trade gdp.pc
## mpi 0.19 -0.43 -0.04 -0.24 -0.66
## edu.total -0.19 0.44 -0.10 0.30 0.47
## edu.pri 0.22 -0.50 -0.12 -0.35 -0.53
## edu.sec -0.17 0.41 0.01 0.08 0.38
## edu.ter -0.15 -0.21 0.08 0.12 0.39
## hlth -0.38 0.39 -0.21 0.09 0.59
## mil 0.01 -0.11 -0.05 -0.34 -0.06
## fdi -0.22 0.08 0.04 -0.09 0.59
## lbr.part 0.05 -0.08 0.01 -0.14 -0.04
## unemp -0.11 0.27 -0.17 -0.03 0.02
## pop.gwth.total 0.22 -0.58 -0.06 -0.28 -0.44
## pop.gwth.rural 0.11 -0.55 0.01 -0.13 -0.43
## pop.gwth.urban 0.23 -0.53 0.00 -0.29 -0.50
## gdp.dflt 1.00 0.18 0.09 -0.20 -0.49
## gdr.eql 0.18 1.00 0.30 0.45 0.33
## gcf 0.09 0.30 1.00 0.21 -0.01
## trade -0.20 0.45 0.21 1.00 0.24
## gdp.pc -0.49 0.33 -0.01 0.24 1.00
corrplot(cor.mtx$r, type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45)
Some significant correlations can be found between
gdr.eql
with edu.pri, pop.gwth.total, and
fdi with mil, etc. We expect these variables
to be eliminated in the VIF test. This might be a good property for
imputation (3.2.4)
We conduct a normal linear regression, following the approaches mentioned above to address missing values issues.
1. What is imputation
Imputation is a technique to handle missing values by replacing missing
data with substitute values. In our study, we focus on “item
imputation”, which means substituting for a component of a data point
(i.e. a variable). The general idea is to take a value that preserve the
property of the data (e.g., distribution, mean, standard deviation), and
use other variables to predict the missing values (much like
regression). Since we observed some correlations during descriptive
analysis, we expect the imputation algorithm to work well.
We use Multivariate Imputation By Chained Equations (MICE) as primary tool for this technique, because of its wide acceptance in scientific studies (Alruhaymi and Kim (2021)). In order to use this technique, we have to assume that the “missingness” of a field can be explained by the values in other columns, (e.g., If the countries is in North America, it’s more likely to be missing as we have discovered in 2.2). The general ideas of the algorithm is to fill in the missing values, and improve it iteratively until predicted values converse to a stable point. The algorithmic details of MICE is very concisely (and enthrallingly) explained in Gopalan (2020).
We use relatively small parameters in the interest of time.
m = 5 imputed data setsmaxit = 30 iterationsThe imputation method is cart (Classification and
Regression Trees).
set.seed(1984)
# split data
isComplete <- which(complete.cases(countries1))
idx <- sample(isComplete, replace = F, 0.3 * nrow(countries1))
countries.train.4 <- countries1[-idx, ]
countries.test.4 <- countries1[idx, ]
# blank for no imputation, cart for variable
# requiring imputation
meth <- c(rep("", 4), "cart", "", rep("cart", 16))
names(meth) <- colnames(countries1)
meth
## country.code country.name year pov income
## "" "" "" "" "cart"
## reg edu.total hlth mil fdi
## "" "cart" "cart" "cart" "cart"
## lbr.part unemp pop.gwth.total pop.gwth.rural pop.gwth.urban
## "cart" "cart" "cart" "cart" "cart"
## gdp.dflt gcf trade gdp.pc lfdi
## "cart" "cart" "cart" "cart" "cart"
## lgdp.dflt lgdp.pc
## "cart" "cart"
# run the algorithm countries1.imputed <-
# mice(countries1, m = 3, maxit = 20, method =
# meth) saveRDS(countries1.imputed,
# 'countries1.imputed.RData')
countries1.imputed <- readRDS("countries1.imputed.RData")
summary(countries1.imputed)
## Class: mids
## Number of multiple imputations: 3
## Imputation methods:
## country.code country.name year pov income
## "" "" "" "" "cart"
## reg edu.total hlth mil fdi
## "" "cart" "cart" "cart" "cart"
## lbr.part unemp pop.gwth.total pop.gwth.rural pop.gwth.urban
## "cart" "cart" "" "cart" "cart"
## gdp.dflt gcf trade gdp.pc lfdi
## "cart" "cart" "cart" "cart" "cart"
## lgdp.dflt lgdp.pc
## "cart" "cart"
## PredictorMatrix:
## country.code country.name year pov income reg edu.total hlth mil
## country.code 0 0 1 1 1 1 1 1 1
## country.name 1 0 1 1 1 1 1 1 1
## year 1 0 0 1 1 1 1 1 1
## pov 1 0 1 0 1 1 1 1 1
## income 1 0 1 1 0 1 1 1 1
## reg 1 0 1 1 1 0 1 1 1
## fdi lbr.part unemp pop.gwth.total pop.gwth.rural pop.gwth.urban
## country.code 1 1 1 1 1 1
## country.name 1 1 1 1 1 1
## year 1 1 1 1 1 1
## pov 1 1 1 1 1 1
## income 1 1 1 1 1 1
## reg 1 1 1 1 1 1
## gdp.dflt gcf trade gdp.pc lfdi lgdp.dflt lgdp.pc
## country.code 1 1 1 1 1 1 1
## country.name 1 1 1 1 1 1 1
## year 1 1 1 1 1 1 1
## pov 1 1 1 1 1 1 1
## income 1 1 1 1 1 1 1
## reg 1 1 1 1 1 1 1
## Number of logged events: 961
## it im dep meth
## 1 0 0 constant
## 2 1 1 income cart
## 3 1 1 edu.total cart
## 4 1 1 hlth cart
## 5 1 1 mil cart
## 6 1 1 fdi cart
## out
## 1 country.name
## 2 country.codeEAS, country.codeECS, country.codeFCS, country.codeHIC, country.codeIDA, country.codeLCN, country.codeLIC, country.codeLMC, country.codeLMY, country.codeMEA, country.codeNRU, country.codeSAS, country.codeSSF, country.codeUMC, country.codeWLD, country.codeAFG, country.codeABW, country.codeAND, country.codeANT, country.codeASM, country.codeATG, country.codeBHR, country.codeBHS, country.codeBMU, country.codeBRB, country.codeBRN, country.codeCHI, country.codeCSK, country.codeCUB, country.codeCUW, country.codeCYM, country.codeDMA, country.codeERI, country.codeFRO, country.codeGIB, country.codeGNQ, country.codeGRD, country.codeGRL, country.codeGUM, country.codeHKG, country.codeIMN, country.codeKHM, country.codeKNA, country.codeKWT, country.codeLBY, country.codeLIE, country.codeMAC, country.codeMAF, country.codeMCO, country.codeMNP, country.codeMYT, country.codeNCL, country.codeNZL, country.codeOMN, country.codePLW, country.codePRI, country.codePRK, country.codePYF, country.codeQAT, country.codeSAU, country.codeSGP, country.codeSMR, country.codeSUN, country.codeSXM, country.codeTCA, country.codeTWN, country.codeVCT, country.codeVGB, country.codeVIR, country.codeYUG, country.codeYUGf, country.codeAFE, country.codeAFW, country.codeARB, country.codeCEB, country.codeCSS, country.codeEAP, country.codeEAR, country.codeECA, country.codeEMU, country.codeEUU, country.codeHPC, country.codeIBD, country.codeIBT, country.codeIDB, country.codeIDX, country.codeLAC, country.codeLDC, country.codeLTE, country.codeMIC, country.codeMNA, country.codeNAC, country.codeOED, country.codeOSS, country.codePRE, country.codePST, country.codeSSA, country.codeSST, country.codeTEA, country.codeTEC, country.codeTLA, country.codeTMN, country.codeTSA, country.codeTSS, country.codePSS, regEurope & Central Asia, regLatin America & Caribbean, regNorth America, regSouth Asia, regSub-Saharan Africa, edu.total
## 3 country.codeARE, country.codeBIH, country.codeCOD, country.codeDZA, country.codeEAS, country.codeECS, country.codeFCS, country.codeFSM, country.codeGUY, country.codeHIC, country.codeHTI, country.codeIDA, country.codeIRQ, country.codeLCN, country.codeLIC, country.codeLMC, country.codeLMY, country.codeMEA, country.codeMKD, country.codeMNE, country.codeNGA, country.codeNRU, country.codePNG, country.codeSAS, country.codeSOM, country.codeSSF, country.codeSUR, country.codeTKM, country.codeTTO, country.codeTUV, country.codeUMC, country.codeUZB, country.codeWLD, country.codeXKX, country.codeYEM, country.codeAFG, country.codeABW, country.codeAND, country.codeANT, country.codeASM, country.codeATG, country.codeBHR, country.codeBHS, country.codeBMU, country.codeBRB, country.codeBRN, country.codeCHI, country.codeCSK, country.codeCUB, country.codeCUW, country.codeCYM, country.codeDMA, country.codeERI, country.codeFRO, country.codeGIB, country.codeGNQ, country.codeGRD, country.codeGRL, country.codeGUM, country.codeHKG, country.codeIMN, country.codeKHM, country.codeKNA, country.codeKWT, country.codeLBY, country.codeLIE, country.codeMAC, country.codeMAF, country.codeMCO, country.codeMNP, country.codeMYT, country.codeNCL, country.codeNZL, country.codeOMN, country.codePLW, country.codePRI, country.codePRK, country.codePYF, country.codeQAT, country.codeSAU, country.codeSGP, country.codeSMR, country.codeSUN, country.codeSXM, country.codeTCA, country.codeTWN, country.codeVCT, country.codeVGB, country.codeVIR, country.codeYUG, country.codeYUGf, country.codeAFE, country.codeAFW, country.codeARB, country.codeCEB, country.codeCSS, country.codeEAP, country.codeEAR, country.codeECA, country.codeEMU, country.codeEUU, country.codeHPC, country.codeIBD, country.codeIBT, country.codeIDB, country.codeIDX, country.codeLAC, country.codeLDC, country.codeLTE, country.codeMIC, country.codeMNA, country.codeNAC, country.codeOED, country.codeOSS, country.codePRE, country.codePST, country.codeSSA, country.codeSST, country.codeTEA, country.codeTEC, country.codeTLA, country.codeTMN, country.codeTSA, country.codeTSS, country.codePSS, regEurope & Central Asia, regLatin America & Caribbean, regMiddle East & North Africa, regNorth America, hlth, fdi
## 4 country.codeBLZ, country.codeEAS, country.codeECS, country.codeFCS, country.codeGUY, country.codeHIC, country.codeIDA, country.codeLCN, country.codeLIC, country.codeLMC, country.codeLMY, country.codeMEA, country.codePSE, country.codeSAS, country.codeSOM, country.codeSSD, country.codeSSF, country.codeSUR, country.codeTKM, country.codeTTO, country.codeUMC, country.codeUSA, country.codeWLD, country.codeXKX, country.codeAFG, country.codeABW, country.codeAND, country.codeANT, country.codeASM, country.codeATG, country.codeBHR, country.codeBHS, country.codeBMU, country.codeBRB, country.codeBRN, country.codeCHI, country.codeCSK, country.codeCUB, country.codeCUW, country.codeCYM, country.codeDMA, country.codeERI, country.codeFRO, country.codeGIB, country.codeGNQ, country.codeGRD, country.codeGRL, country.codeGUM, country.codeHKG, country.codeIMN, country.codeKHM, country.codeKNA, country.codeKWT, country.codeLBY, country.codeLIE, country.codeMAC, country.codeMAF, country.codeMCO, country.codeMNP, country.codeMYT, country.codeNCL, country.codeNZL, country.codeOMN, country.codePLW, country.codePRI, country.codePRK, country.codePYF, country.codeQAT, country.codeSAU, country.codeSGP, country.codeSMR, country.codeSUN, country.codeSXM, country.codeTCA, country.codeTWN, country.codeVCT, country.codeVGB, country.codeVIR, country.codeYUG, country.codeYUGf, country.codeAFE, country.codeAFW, country.codeARB, country.codeCEB, country.codeCSS, country.codeEAP, country.codeEAR, country.codeECA, country.codeEMU, country.codeEUU, country.codeHPC, country.codeIBD, country.codeIBT, country.codeIDB, country.codeIDX, country.codeLAC, country.codeLDC, country.codeLTE, country.codeMIC, country.codeMNA, country.codeNAC, country.codeOED, country.codeOSS, country.codePRE, country.codePST, country.codeSSA, country.codeSST, country.codeTEA, country.codeTEC, country.codeTLA, country.codeTMN, country.codeTSA, country.codeTSS, country.codePSS, regEurope & Central Asia, regLatin America & Caribbean, regNorth America, edu.total, mil
## 5 country.codeBTN, country.codeCOM, country.codeDJI, country.codeEAS, country.codeECS, country.codeFCS, country.codeFSM, country.codeHIC, country.codeHTI, country.codeIDA, country.codeISL, country.codeKIR, country.codeLCA, country.codeLCN, country.codeLIC, country.codeLMC, country.codeLMY, country.codeMDV, country.codeMEA, country.codeMHL, country.codeNRU, country.codePSE, country.codeSAS, country.codeSLB, country.codeSOM, country.codeSSF, country.codeSTP, country.codeSUR, country.codeTON, country.codeTTO, country.codeTUV, country.codeUMC, country.codeVUT, country.codeWLD, country.codeWSM, country.codeAFG, country.codeABW, country.codeAND, country.codeANT, country.codeASM, country.codeATG, country.codeBHR, country.codeBHS, country.codeBMU, country.codeBRB, country.codeBRN, country.codeCHI, country.codeCSK, country.codeCUB, country.codeCUW, country.codeCYM, country.codeDMA, country.codeERI, country.codeFRO, country.codeGIB, country.codeGNQ, country.codeGRD, country.codeGRL, country.codeGUM, country.codeHKG, country.codeIMN, country.codeKHM, country.codeKNA, country.codeKWT, country.codeLBY, country.codeLIE, country.codeMAC, country.codeMAF, country.codeMCO, country.codeMNP, country.codeMYT, country.codeNCL, country.codeNZL, country.codeOMN, country.codePLW, country.codePRI, country.codePRK, country.codePYF, country.codeQAT, country.codeSAU, country.codeSGP, country.codeSMR, country.codeSUN, country.codeSXM, country.codeTCA, country.codeTWN, country.codeVCT, country.codeVGB, country.codeVIR, country.codeYUG, country.codeYUGf, country.codeAFE, country.codeAFW, country.codeARB, country.codeCEB, country.codeCSS, country.codeEAP, country.codeEAR, country.codeECA, country.codeEMU, country.codeEUU, country.codeHPC, country.codeIBD, country.codeIBT, country.codeIDB, country.codeIDX, country.codeLAC, country.codeLDC, country.codeLTE, country.codeMIC, country.codeMNA, country.codeNAC, country.codeOED, country.codeOSS, country.codePRE, country.codePST, country.codeSSA, country.codeSST, country.codeTEA, country.codeTEC, country.codeTLA, country.codeTMN, country.codeTSA, country.codeTSS, country.codePSS, regEurope & Central Asia, regLatin America & Caribbean, regMiddle East & North Africa, regSub-Saharan Africa, edu.total, hlth
## 6 country.codeEAS, country.codeECS, country.codeFCS, country.codeHIC, country.codeIDA, country.codeLCN, country.codeLIC, country.codeLMC, country.codeLMY, country.codeMEA, country.codeNRU, country.codeSAS, country.codeSSF, country.codeUMC, country.codeUSA, country.codeWLD, country.codeAFG, country.codeABW, country.codeAND, country.codeANT, country.codeASM, country.codeATG, country.codeBHR, country.codeBHS, country.codeBMU, country.codeBRB, country.codeBRN, country.codeCHI, country.codeCSK, country.codeCUB, country.codeCUW, country.codeCYM, country.codeDMA, country.codeERI, country.codeFRO, country.codeGIB, country.codeGNQ, country.codeGRD, country.codeGRL, country.codeGUM, country.codeHKG, country.codeIMN, country.codeKHM, country.codeKNA, country.codeKWT, country.codeLBY, country.codeLIE, country.codeMAC, country.codeMAF, country.codeMCO, country.codeMNP, country.codeMYT, country.codeNCL, country.codeNZL, country.codeOMN, country.codePLW, country.codePRI, country.codePRK, country.codePYF, country.codeQAT, country.codeSAU, country.codeSGP, country.codeSMR, country.codeSUN, country.codeSXM, country.codeTCA, country.codeTWN, country.codeVCT, country.codeVGB, country.codeVIR, country.codeYUG, country.codeYUGf, country.codeAFE, country.codeAFW, country.codeARB, country.codeCEB, country.codeCSS, country.codeEAP, country.codeEAR, country.codeECA, country.codeEMU, country.codeEUU, country.codeHPC, country.codeIBD, country.codeIBT, country.codeIDB, country.codeIDX, country.codeLAC, country.codeLDC, country.codeLTE, country.codeMIC, country.codeMNA, country.codeNAC, country.codeOED, country.codeOSS, country.codePRE, country.codePST, country.codeSSA, country.codeSST, country.codeTEA, country.codeTEC, country.codeTLA, country.codeTMN, country.codeTSA, country.codeTSS, country.codePSS, regEurope & Central Asia, regMiddle East & North Africa, regSouth Asia, edu.total, mil
We can take a look into one of the imputated data sets.
countries1.imputed.1 <- complete(countries1.imputed,
1)
summary(countries1.imputed.1)
## country.code country.name year pov income
## BRA : 36 Length:1843 Min. :1967 Min. : 0.00 H :623
## CRI : 34 Class :character 1st Qu.:2001 1st Qu.: 0.20 L :261
## ARG : 32 Mode :character Median :2008 Median : 1.50 LM:509
## USA : 32 Mean :2007 Mean :10.04 UM:450
## HND : 30 3rd Qu.:2014 3rd Qu.:11.60
## GBR : 29 Max. :2021 Max. :91.50
## (Other):1650
## reg edu.total hlth
## East Asia & Pacific :167 Min. : 1.033 Min. : 1.718
## Europe & Central Asia :845 1st Qu.: 3.437 1st Qu.: 4.980
## Latin America & Caribbean :416 Median : 4.388 Median : 6.771
## Middle East & North Africa:104 Mean : 4.482 Mean : 6.813
## North America : 50 3rd Qu.: 5.376 3rd Qu.: 8.463
## South Asia : 53 Max. :15.750 Max. :17.733
## Sub-Saharan Africa :208
## mil fdi lbr.part unemp
## Min. : 0.000 Min. :-3.444e+11 Min. :30.50 Min. : 0.250
## 1st Qu.: 1.040 1st Qu.: 2.982e+08 1st Qu.:56.31 1st Qu.: 4.415
## Median : 1.465 Median : 1.702e+09 Median :61.46 Median : 6.770
## Mean : 1.784 Mean : 1.613e+10 Mean :61.14 Mean : 8.083
## 3rd Qu.: 2.100 3rd Qu.: 9.793e+09 3rd Qu.:65.93 3rd Qu.:10.075
## Max. :19.385 Max. : 7.338e+11 Max. :93.00 Max. :49.700
##
## pop.gwth.total pop.gwth.rural pop.gwth.urban gdp.dflt
## Min. :-3.6295 Min. :-8.560655 Min. :-4.078 Min. : -26.300
## 1st Qu.: 0.2836 1st Qu.:-0.846827 1st Qu.: 0.517 1st Qu.: 1.734
## Median : 1.0378 Median :-0.021085 Median : 1.485 Median : 3.890
## Mean : 1.0639 Mean : 0.005301 Mean : 1.692 Mean : 16.129
## 3rd Qu.: 1.7764 3rd Qu.: 0.959723 3rd Qu.: 2.652 3rd Qu.: 8.606
## Max. : 5.6145 Max. : 4.596858 Max. :13.805 Max. :3333.585
##
## gcf trade gdp.pc lfdi
## Min. : 0.00 Min. : 1.378 Min. : 119.7 Min. : 6.908
## 1st Qu.:19.63 1st Qu.: 50.622 1st Qu.: 1904.4 1st Qu.:19.513
## Median :22.71 Median : 72.606 Median : 5913.4 Median :21.255
## Mean :23.89 Mean : 83.509 Mean : 14983.6 Mean :20.767
## 3rd Qu.:26.78 3rd Qu.:104.706 3rd Qu.: 20863.0 3rd Qu.:23.008
## Max. :69.48 Max. :380.104 Max. :123678.7 Max. :27.322
##
## lgdp.dflt lgdp.pc
## Min. :-2.9224 Min. : 4.785
## 1st Qu.: 0.5504 1st Qu.: 7.552
## Median : 1.3584 Median : 8.685
## Mean : 1.2262 Mean : 8.666
## 3rd Qu.: 2.1506 3rd Qu.: 9.946
## Max. : 8.1118 Max. :11.725
##
sum(!complete.cases(countries1.imputed.1))
## [1] 0
We found no missing cases as expected.
We generated 3 sets of imputated (training) data. We can build separate models using each data set, and combine the estimates using pooling rule. With all the generated data sets.
formula.str <- "pov ~ country.code+year+income+edu.total+hlth+mil+fdi+lbr.part+unemp+pop.gwth.total+pop.gwth.rural+pop.gwth.urban+gdp.dflt+gcf+trade+gdp.pc+lfdi+lgdp.dflt+lgdp.pc"
# Using single imputed data set
fit1 <- lapply(1:countries1.imputed$m, function(i) lm(as.formula(formula.str),
data = complete(countries1.imputed, i)))
# Build models with all generated data set and
# pool the estimates
fit2 <- with(data = countries1.imputed, exp = lm(as.formula(formula.str)))
fit2.combined <- pool(fit2)
# dummy lm model
fit2.dummy <- lm(as.formula(formula.str), data = complete(countries1.imputed,
1))
# replace coefficients of dummy model
name.coef <- names(fit2.dummy$coefficients)
fit2.dummy$coefficients <- fit2.combined$pooled$estimate
names(fit2.dummy$coefficients) <- name.coef
Helper function to calculate R-Squared
r2 <- function(pred, orig) {
RSS <- sum((pred - orig)^2)
TSS <- sum((orig - mean(orig))^2)
R2 <- 1 - RSS/TSS
return(R2)
}
adjR2 <- function(pred, orig, k) {
R2 <- r2(pred, orig)
n <- length(pred)
adjr2 <- 1 - (1 - R2) * (n - 1)/(n - k - 1)
return(adjr2)
}
Adjusted R-Squared on train and test
fit1.summ <- lapply(fit1, function(f) {
# number of variables
k <- length(f$coefficients) - 1
# predict value of test
pred <- predict(f, countries.test.4)
test.resid <- pred - countries.test.4$pov
data.frame(`Train MSE` = mean(summary(f)$residuals^2),
`Test MSE` = mean(test.resid^2), `Train MAE` = mean(abs(summary(f)$residuals)),
`Test MAE` = mean(abs(test.resid)), `Train Adj.R2` = summary(f)$adj.r.squared,
`Test Adj.R2` = adjR2(pred, countries.test.4$pov,
k))
})
# number of variables
k <- length(fit2.dummy$coefficients) - 1
# number of variables
fit2.pred <- predict(fit2.dummy, countries.test.4)
test.resid <- fit2.pred - countries.test.4$pov
fit2.summ <- data.frame(`Train MSE` = mean(summary(fit2.dummy)$residuals^2),
`Test MSE` = mean(test.resid^2), `Train MAE` = mean(abs(summary(fit2.dummy)$residuals)),
`Test MAE` = mean(abs(test.resid)), `Train Adj.R2` = pool.r.squared(fit2,
adjusted = T)[, "est"], `Test Adj.R2` = adjR2(fit2.pred,
countries.test.4$pov, k))
res <- do.call(rbind.data.frame, fit1.summ)
res <- rbind(res, fit2.summ)
res <- cbind(data.frame(Set = c(1:3, "Pooled")), res)
kable(res)
| Set | Train.MSE | Test.MSE | Train.MAE | Test.MAE | Train.Adj.R2 | Test.Adj.R2 |
|---|---|---|---|---|---|---|
| 1 | 25.12444 | 12.03457 | 3.060598 | 2.125496 | 0.9090249 | 0.7000035 |
| 2 | 26.22526 | 11.65027 | 3.089747 | 2.084970 | 0.9050389 | 0.7095833 |
| 3 | 26.67063 | 11.67330 | 3.097355 | 2.076761 | 0.9034262 | 0.7090092 |
| Pooled | 25.12444 | 11.71770 | 3.060598 | 2.088330 | 0.9058581 | 0.7079023 |
Pooled model slightly improve performance. Test data fitting has better MSE and MAE than train data. However, Adjusted R-Squared is lower in test data. There might be some over-fitting in our models.
# find coef of variable other than country.code
var <- c("year", "incomeL", "incomeLM", "incomeUM",
"edu.total", "hlth", "mil", "fdi", "lbr.part",
"unemp", "pop.gwth.total", "pop.gwth.rural", "pop.gwth.urban",
"gdp.dflt", "gcf", "trade", "gdp.pc", "lfdi", "lgdp.dflt",
"lgdp.pc")
res <- do.call(rbind, lapply(fit1, function(f) f$coefficients[var]))
res <- rbind(res, fit2.dummy$coefficients[var])
res <- cbind(data.frame(Set = c(1:3, "Pooled")), res)
kable(res)
| Set | year | incomeL | incomeLM | incomeUM | edu.total | hlth | mil | fdi | lbr.part | unemp | pop.gwth.total | pop.gwth.rural | pop.gwth.urban | gdp.dflt | gcf | trade | gdp.pc | lfdi | lgdp.dflt | lgdp.pc |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | -0.1793595 | -1.1220814 | -6.015848 | -3.267545 | -0.4937974 | 0.3315404 | -0.2223303 | 0 | 0.2373374 | -0.0629923 | -3.117747 | 0.6814172 | 2.365014 | 0.0002297 | -0.1865163 | -0.0146545 | 0.0003003 | -0.0001849 | 0.0333216 | -8.181891 |
| 2 | -0.2388829 | -0.6675004 | -5.058398 | -2.728803 | -0.4452778 | 0.4565476 | -0.3394690 | 0 | 0.2041507 | -0.0683368 | -3.048303 | 0.6404098 | 2.335421 | 0.0004350 | -0.2286906 | -0.0047090 | 0.0002796 | 0.0003921 | 0.1156346 | -6.720234 |
| 3 | -0.2178625 | 0.0956525 | -5.415334 | -2.903567 | -0.5451318 | 0.2542830 | -0.0866469 | 0 | 0.1360500 | -0.0821195 | -3.329652 | 0.7158451 | 2.459798 | -0.0001378 | -0.1883574 | -0.0081599 | 0.0003002 | 0.0956085 | 0.0381381 | -7.121499 |
| Pooled | -0.2120350 | -0.5646431 | -5.496527 | -2.966638 | -0.4947357 | 0.3474570 | -0.2161487 | 0 | 0.1925127 | -0.0711495 | -3.165234 | 0.6792240 | 2.386744 | 0.0001756 | -0.2011881 | -0.0091745 | 0.0002934 | 0.0319386 | 0.0623648 | -7.341208 |
In contradiction to expectation, hlth is positively
correlated with pov, and lower-middle income countries
(incomeLM) are less poor than high income countries (incomeH).
pop.gwth.total is negatively related to pov,
while pop.gwth.rural and pop.gwth.urban are
positively related. They might be indications of over-fitting model.
# fit model on imputated set no. 1
summary(fit1[[1]])
##
## Call:
## lm(formula = as.formula(formula.str), data = complete(countries1.imputed,
## i))
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.335 -1.717 0.106 1.632 32.720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.405e+02 6.202e+01 7.102 1.81e-12 ***
## country.codeALB -1.816e+01 3.752e+00 -4.840 1.42e-06 ***
## country.codeARE -1.174e+01 5.213e+00 -2.253 0.024402 *
## country.codeARG -1.432e+01 3.576e+00 -4.004 6.52e-05 ***
## country.codeARM -1.593e+01 3.573e+00 -4.459 8.77e-06 ***
## country.codeAUS -1.533e+01 3.974e+00 -3.857 0.000119 ***
## country.codeAUT -1.375e+01 3.985e+00 -3.450 0.000575 ***
## country.codeAZE -2.666e+01 3.925e+00 -6.792 1.53e-11 ***
## country.codeBDI 1.512e+01 4.355e+00 3.472 0.000530 ***
## country.codeBEL -9.960e+00 3.976e+00 -2.505 0.012338 *
## country.codeBEN 8.375e+00 4.199e+00 1.995 0.046252 *
## country.codeBFA 9.856e+00 3.978e+00 2.477 0.013333 *
## country.codeBGD -1.662e+01 3.767e+00 -4.411 1.10e-05 ***
## country.codeBGR -9.817e+00 3.755e+00 -2.614 0.009028 **
## country.codeBIH -1.396e+01 4.349e+00 -3.209 0.001358 **
## country.codeBLR -1.373e+01 3.608e+00 -3.805 0.000147 ***
## country.codeBLZ 1.622e+00 3.957e+00 0.410 0.681965
## country.codeBOL -1.448e+01 3.460e+00 -4.185 3.00e-05 ***
## country.codeBRA -7.943e+00 3.491e+00 -2.275 0.023003 *
## country.codeBTN -1.360e+01 4.246e+00 -3.203 0.001388 **
## country.codeBWA 3.192e+00 4.061e+00 0.786 0.431903
## country.codeCAF 3.208e+01 4.956e+00 6.473 1.26e-10 ***
## country.codeCAN -1.584e+01 3.899e+00 -4.062 5.09e-05 ***
## country.codeCHE -2.027e+01 4.219e+00 -4.804 1.69e-06 ***
## country.codeCHL -1.138e+01 3.609e+00 -3.153 0.001643 **
## country.codeCHN -3.392e+00 3.676e+00 -0.923 0.356314
## country.codeCIV -1.097e+01 3.596e+00 -3.050 0.002324 **
## country.codeCMR -2.086e+00 4.171e+00 -0.500 0.617037
## country.codeCOD 3.261e+01 5.009e+00 6.510 9.94e-11 ***
## country.codeCOG 2.285e+01 4.896e+00 4.667 3.31e-06 ***
## country.codeCOL -8.338e+00 3.468e+00 -2.404 0.016310 *
## country.codeCOM -1.041e+01 4.951e+00 -2.104 0.035569 *
## country.codeCPV -5.640e+00 4.472e+00 -1.261 0.207485
## country.codeCRI -1.395e+01 3.490e+00 -3.998 6.66e-05 ***
## country.codeCYP -1.151e+01 3.815e+00 -3.017 0.002590 **
## country.codeCZE -1.197e+01 3.746e+00 -3.195 0.001423 **
## country.codeDEU -1.508e+01 3.953e+00 -3.816 0.000141 ***
## country.codeDJI 6.445e+00 4.265e+00 1.511 0.130905
## country.codeDNK -1.524e+01 4.075e+00 -3.740 0.000191 ***
## country.codeDOM -1.348e+01 3.461e+00 -3.894 0.000103 ***
## country.codeDZA -1.533e+01 4.460e+00 -3.438 0.000600 ***
## country.codeECU -9.712e+00 3.446e+00 -2.818 0.004891 **
## country.codeEGY -1.813e+01 3.668e+00 -4.944 8.45e-07 ***
## country.codeESP -1.224e+01 3.768e+00 -3.247 0.001188 **
## country.codeEST -9.836e+00 3.806e+00 -2.584 0.009849 **
## country.codeETH -7.996e+00 4.107e+00 -1.947 0.051699 .
## country.codeFIN -1.376e+01 3.958e+00 -3.476 0.000523 ***
## country.codeFJI -1.390e+01 4.229e+00 -3.287 0.001034 **
## country.codeFRA -1.388e+01 3.910e+00 -3.550 0.000397 ***
## country.codeFSM -5.263e+00 5.172e+00 -1.018 0.309035
## country.codeGAB -8.619e+00 4.951e+00 -1.741 0.081899 .
## country.codeGBR -1.621e+01 3.855e+00 -4.205 2.75e-05 ***
## country.codeGEO -1.156e+01 3.606e+00 -3.205 0.001379 **
## country.codeGHA 9.149e+00 3.816e+00 2.397 0.016621 *
## country.codeGIN 3.754e+00 3.865e+00 0.971 0.331520
## country.codeGMB 9.848e-01 4.182e+00 0.235 0.813864
## country.codeGNB 1.064e+00 4.019e+00 0.265 0.791274
## country.codeGRC -1.146e+01 3.809e+00 -3.009 0.002664 **
## country.codeGTM -6.416e+00 3.894e+00 -1.648 0.099560 .
## country.codeGUY -6.993e+00 5.223e+00 -1.339 0.180797
## country.codeHND -6.040e+00 3.353e+00 -1.801 0.071809 .
## country.codeHRV -1.128e+01 3.891e+00 -2.898 0.003802 **
## country.codeHTI -2.479e+00 6.230e+00 -0.398 0.690809
## country.codeHUN -1.008e+01 3.812e+00 -2.645 0.008247 **
## country.codeIDN 1.096e+00 3.397e+00 0.323 0.747101
## country.codeIND 5.791e-01 3.606e+00 0.161 0.872414
## country.codeIRL -1.440e+01 3.979e+00 -3.618 0.000305 ***
## country.codeIRN -1.088e+01 3.556e+00 -3.060 0.002251 **
## country.codeIRQ -1.380e+01 4.920e+00 -2.805 0.005086 **
## country.codeISL -1.914e+01 4.067e+00 -4.706 2.73e-06 ***
## country.codeISR -1.203e+01 3.739e+00 -3.217 0.001320 **
## country.codeITA -1.177e+01 3.869e+00 -3.043 0.002381 **
## country.codeJAM -1.735e+01 3.890e+00 -4.461 8.70e-06 ***
## country.codeJOR -1.417e+01 3.884e+00 -3.649 0.000271 ***
## country.codeJPN -1.244e+01 5.096e+00 -2.442 0.014703 *
## country.codeKAZ -1.408e+01 3.524e+00 -3.995 6.75e-05 ***
## country.codeKEN -1.160e+01 4.018e+00 -2.886 0.003947 **
## country.codeKGZ -2.110e+01 3.451e+00 -6.115 1.20e-09 ***
## country.codeKIR -1.680e+01 5.025e+00 -3.344 0.000846 ***
## country.codeKOR -1.048e+01 4.175e+00 -2.510 0.012163 *
## country.codeLAO -1.647e+01 3.912e+00 -4.209 2.70e-05 ***
## country.codeLBN -1.119e+01 6.227e+00 -1.796 0.072603 .
## country.codeLBR -6.869e+00 4.557e+00 -1.507 0.131922
## country.codeLCA 8.071e+00 5.044e+00 1.600 0.109786
## country.codeLKA -1.775e+01 3.745e+00 -4.740 2.32e-06 ***
## country.codeLSO 1.014e+01 4.306e+00 2.356 0.018602 *
## country.codeLTU -1.022e+01 3.885e+00 -2.630 0.008618 **
## country.codeLUX -1.900e+01 4.554e+00 -4.171 3.18e-05 ***
## country.codeLVA -9.501e+00 3.843e+00 -2.472 0.013531 *
## country.codeMAR -1.495e+01 3.884e+00 -3.849 0.000123 ***
## country.codeMDA -1.260e+01 3.655e+00 -3.449 0.000577 ***
## country.codeMDG 2.203e+01 3.793e+00 5.809 7.54e-09 ***
## country.codeMDV -1.038e+01 4.226e+00 -2.456 0.014169 *
## country.codeMEX -8.813e+00 3.603e+00 -2.446 0.014552 *
## country.codeMHL -8.443e+00 6.649e+00 -1.270 0.204346
## country.codeMKD -6.353e+00 3.804e+00 -1.670 0.095058 .
## country.codeMLI 5.767e+00 4.036e+00 1.429 0.153226
## country.codeMLT -7.505e+00 4.209e+00 -1.783 0.074766 .
## country.codeMMR -2.049e+01 5.019e+00 -4.081 4.69e-05 ***
## country.codeMNE -9.307e+00 3.940e+00 -2.362 0.018274 *
## country.codeMNG -1.241e+01 3.621e+00 -3.427 0.000626 ***
## country.codeMOZ 3.315e+01 4.226e+00 7.845 7.68e-15 ***
## country.codeMRT -7.788e+00 3.753e+00 -2.075 0.038141 *
## country.codeMUS -1.159e+01 4.590e+00 -2.525 0.011664 *
## country.codeMWI 1.959e+01 4.109e+00 4.767 2.03e-06 ***
## country.codeMYS -1.630e+01 3.657e+00 -4.458 8.83e-06 ***
## country.codeNAM 4.837e+00 4.565e+00 1.060 0.289437
## country.codeNER 2.781e+01 3.890e+00 7.148 1.32e-12 ***
## country.codeNGA 7.590e+00 3.764e+00 2.017 0.043888 *
## country.codeNIC -1.255e+01 3.906e+00 -3.214 0.001334 **
## country.codeNLD -1.323e+01 4.078e+00 -3.245 0.001199 **
## country.codeNOR -1.998e+01 4.160e+00 -4.804 1.70e-06 ***
## country.codeNPL -1.628e+01 4.602e+00 -3.537 0.000416 ***
## country.codeNRU -9.911e+00 6.312e+00 -1.570 0.116570
## country.codePAK -8.706e+00 3.542e+00 -2.458 0.014064 *
## country.codePAN -6.792e+00 3.494e+00 -1.944 0.052037 .
## country.codePER -1.059e+01 3.487e+00 -3.036 0.002435 **
## country.codePHL -1.152e+01 3.802e+00 -3.030 0.002486 **
## country.codePNG 2.238e+01 4.985e+00 4.490 7.61e-06 ***
## country.codePOL -1.127e+01 3.747e+00 -3.009 0.002658 **
## country.codePRT -1.476e+01 3.859e+00 -3.825 0.000136 ***
## country.codePRY -1.706e+01 3.422e+00 -4.984 6.87e-07 ***
## country.codePSE -1.405e+01 3.720e+00 -3.777 0.000164 ***
## country.codeROU -6.286e+00 3.756e+00 -1.674 0.094394 .
## country.codeRUS -1.617e+01 3.580e+00 -4.515 6.76e-06 ***
## country.codeRWA 1.790e+01 4.076e+00 4.391 1.20e-05 ***
## country.codeSDN -3.057e+00 4.952e+00 -0.617 0.537071
## country.codeSEN 1.463e+01 3.852e+00 3.798 0.000151 ***
## country.codeSLB 5.575e+00 4.968e+00 1.122 0.261913
## country.codeSLE 4.242e+00 4.270e+00 0.993 0.320678
## country.codeSLV -1.324e+01 3.497e+00 -3.787 0.000158 ***
## country.codeSOM 3.135e+01 6.245e+00 5.019 5.75e-07 ***
## country.codeSRB -7.874e+00 3.925e+00 -2.006 0.045006 *
## country.codeSSD 1.749e+01 4.971e+00 3.518 0.000446 ***
## country.codeSTP -1.980e-01 4.498e+00 -0.044 0.964886
## country.codeSUR 2.164e+00 6.245e+00 0.347 0.728999
## country.codeSVK -1.022e+01 3.850e+00 -2.655 0.008008 **
## country.codeSVN -1.203e+01 3.868e+00 -3.109 0.001911 **
## country.codeSWE -1.573e+01 3.959e+00 -3.974 7.38e-05 ***
## country.codeSWZ 3.383e+01 4.222e+00 8.013 2.10e-15 ***
## country.codeSYC -8.223e+00 5.079e+00 -1.619 0.105593
## country.codeSYR -1.185e+01 4.958e+00 -2.390 0.016964 *
## country.codeTCD 8.186e+00 4.464e+00 1.834 0.066867 .
## country.codeTGO 1.183e+01 4.205e+00 2.814 0.004957 **
## country.codeTHA -2.016e+01 3.507e+00 -5.748 1.07e-08 ***
## country.codeTJK -1.038e+01 3.911e+00 -2.655 0.008016 **
## country.codeTKM 9.976e+00 6.256e+00 1.595 0.110995
## country.codeTLS -1.487e+01 4.481e+00 -3.318 0.000926 ***
## country.codeTON -1.471e+01 4.499e+00 -3.268 0.001104 **
## country.codeTTO -2.031e+01 5.128e+00 -3.960 7.82e-05 ***
## country.codeTUN -1.285e+01 3.816e+00 -3.368 0.000776 ***
## country.codeTUR -9.876e+00 3.467e+00 -2.848 0.004449 **
## country.codeTUV -1.877e+01 6.456e+00 -2.907 0.003701 **
## country.codeTZA 1.412e+01 4.094e+00 3.449 0.000576 ***
## country.codeUGA 4.779e+00 3.769e+00 1.268 0.204900
## country.codeUKR -1.878e+01 3.577e+00 -5.249 1.73e-07 ***
## country.codeURY -1.355e+01 3.749e+00 -3.614 0.000310 ***
## country.codeUSA -1.736e+01 4.039e+00 -4.298 1.82e-05 ***
## country.codeUZB 3.838e+01 4.243e+00 9.045 < 2e-16 ***
## country.codeVEN -1.015e+01 3.629e+00 -2.797 0.005219 **
## country.codeVNM -2.062e+01 3.682e+00 -5.600 2.51e-08 ***
## country.codeVUT -3.411e+00 4.894e+00 -0.697 0.485889
## country.codeWSM -1.279e+01 4.522e+00 -2.828 0.004746 **
## country.codeXKX -7.618e+00 3.807e+00 -2.001 0.045571 *
## country.codeYEM -1.656e+01 4.440e+00 -3.730 0.000198 ***
## country.codeZAF 8.327e+00 3.996e+00 2.084 0.037354 *
## country.codeZMB 2.172e+01 3.652e+00 5.947 3.33e-09 ***
## country.codeZWE 1.661e+00 4.491e+00 0.370 0.711550
## year -1.794e-01 3.171e-02 -5.657 1.81e-08 ***
## incomeL -1.122e+00 1.423e+00 -0.788 0.430540
## incomeLM -6.016e+00 1.015e+00 -5.928 3.73e-09 ***
## incomeUM -3.268e+00 7.499e-01 -4.357 1.40e-05 ***
## edu.total -4.938e-01 1.368e-01 -3.611 0.000314 ***
## hlth 3.315e-01 1.172e-01 2.830 0.004711 **
## mil -2.223e-01 1.882e-01 -1.181 0.237744
## fdi -4.910e-12 3.855e-12 -1.274 0.202949
## lbr.part 2.373e-01 2.609e-02 9.095 < 2e-16 ***
## unemp -6.299e-02 4.080e-02 -1.544 0.122802
## pop.gwth.total -3.118e+00 6.542e-01 -4.766 2.05e-06 ***
## pop.gwth.rural 6.814e-01 2.679e-01 2.544 0.011049 *
## pop.gwth.urban 2.365e+00 3.553e-01 6.656 3.81e-11 ***
## gdp.dflt 2.297e-04 1.101e-03 0.209 0.834695
## gcf -1.865e-01 2.739e-02 -6.810 1.36e-11 ***
## trade -1.465e-02 8.505e-03 -1.723 0.085064 .
## gdp.pc 3.003e-04 2.353e-05 12.765 < 2e-16 ***
## lfdi -1.849e-04 5.216e-02 -0.004 0.997171
## lgdp.dflt 3.332e-02 1.183e-01 0.282 0.778272
## lgdp.pc -8.182e+00 5.402e-01 -15.147 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.289 on 1655 degrees of freedom
## Multiple R-squared: 0.9183, Adjusted R-squared: 0.909
## F-statistic: 99.42 on 187 and 1655 DF, p-value: < 2.2e-16
There are also some weak variable, we should perform some variable selection.
Our data set contains a lots of variables. We can perform some variable selection to reduce over-fitting.
gvif <- lapply(fit1, vif)
gvif
## [[1]]
## GVIF Df GVIF^(1/(2*Df))
## country.code 6.597280e+08 167 1.062687
## year 5.678340e+00 1 2.382927
## income 1.085865e+02 3 2.184218
## edu.total 3.073906e+00 1 1.753256
## hlth 5.712620e+00 1 2.390109
## mil 4.214553e+00 1 2.052938
## fdi 2.533193e+00 1 1.591601
## lbr.part 3.472433e+00 1 1.863447
## unemp 3.505342e+00 1 1.872256
## pop.gwth.total 3.557234e+01 1 5.964255
## pop.gwth.rural 1.022705e+01 1 3.197977
## pop.gwth.urban 2.210527e+01 1 4.701625
## gdp.dflt 1.291944e+00 1 1.136637
## gcf 2.545309e+00 1 1.595402
## trade 1.150763e+01 1 3.392289
## gdp.pc 1.456248e+01 1 3.816082
## lfdi 2.160031e+00 1 1.469704
## lgdp.dflt 2.277128e+00 1 1.509015
## lgdp.pc 4.484112e+01 1 6.696351
##
## [[2]]
## GVIF Df GVIF^(1/(2*Df))
## country.code 8.290746e+08 167 1.063414
## year 6.008229e+00 1 2.451169
## income 1.118177e+02 3 2.194919
## edu.total 3.823352e+00 1 1.955339
## hlth 6.075413e+00 1 2.464835
## mil 4.300241e+00 1 2.073702
## fdi 2.480922e+00 1 1.575094
## lbr.part 3.674864e+00 1 1.916993
## unemp 3.887473e+00 1 1.971668
## pop.gwth.total 3.609086e+01 1 6.007567
## pop.gwth.rural 1.024303e+01 1 3.200473
## pop.gwth.urban 2.228684e+01 1 4.720894
## gdp.dflt 1.284140e+00 1 1.133199
## gcf 2.531994e+00 1 1.591224
## trade 1.048324e+01 1 3.237784
## gdp.pc 1.479989e+01 1 3.847062
## lfdi 2.106014e+00 1 1.451211
## lgdp.dflt 2.257935e+00 1 1.502643
## lgdp.pc 4.402185e+01 1 6.634897
##
## [[3]]
## GVIF Df GVIF^(1/(2*Df))
## country.code 1.087251e+09 167 1.064277
## year 5.845090e+00 1 2.417662
## income 1.092703e+02 3 2.186504
## edu.total 2.956054e+00 1 1.719318
## hlth 4.993071e+00 1 2.234518
## mil 4.274817e+00 1 2.067563
## fdi 2.370888e+00 1 1.539769
## lbr.part 3.299163e+00 1 1.816360
## unemp 4.217429e+00 1 2.053638
## pop.gwth.total 3.652383e+01 1 6.043495
## pop.gwth.rural 1.026343e+01 1 3.203659
## pop.gwth.urban 2.241083e+01 1 4.734007
## gdp.dflt 1.304400e+00 1 1.142103
## gcf 2.658514e+00 1 1.630495
## trade 1.057181e+01 1 3.251432
## gdp.pc 1.430203e+01 1 3.781803
## lfdi 7.525756e+00 1 2.743311
## lgdp.dflt 2.261050e+00 1 1.503679
## lgdp.pc 4.671259e+01 1 6.834661
Some forum suggested that we should use the standard \(GVIF^{1/(2 \cdot df)} < 2\) as equivalent to \(GVIF < 4\) to account for high degree of freedom of some variables.
# adjusted gvif higher than 3
lapply(gvif, function(g) {
g[g[, 3] > 3, 3]
})
## [[1]]
## pop.gwth.total pop.gwth.rural pop.gwth.urban trade gdp.pc
## 5.964255 3.197977 4.701625 3.392289 3.816082
## lgdp.pc
## 6.696351
##
## [[2]]
## pop.gwth.total pop.gwth.rural pop.gwth.urban trade gdp.pc
## 6.007567 3.200473 4.720894 3.237784 3.847062
## lgdp.pc
## 6.634897
##
## [[3]]
## pop.gwth.total pop.gwth.rural pop.gwth.urban trade gdp.pc
## 6.043495 3.203659 4.734007 3.251432 3.781803
## lgdp.pc
## 6.834661
Remove the highest-ranked variables (pop.gwth.total,
pop.gwth.urban, pop.gwth.rural,
gdp.pc, lgdp.pc) and rebuild the models.
formula.str <- "pov ~ country.code+year+income+edu.total+hlth+mil+fdi+lbr.part+unemp+gdp.dflt+gcf+trade+lfdi+lgdp.dflt"
# Using single imputed data set
fit1 <- lapply(1:countries1.imputed$m, function(i) lm(as.formula(formula.str),
data = complete(countries1.imputed, i)))
Re-run GVIF analysis
gvif <- lapply(fit1, vif)
lapply(gvif, function(g) {
g[g[, 3] > 2, 3]
})
## [[1]]
## income hlth mil trade
## 2.036322 2.367399 2.032975 3.244918
##
## [[2]]
## income hlth mil trade
## 2.054620 2.439962 2.046788 3.108168
##
## [[3]]
## income hlth mil unemp trade lfdi
## 2.048709 2.221443 2.039804 2.022614 3.134994 2.633654
There’re still some variable with adjusted GVIF > 2. We are
interested in the effects of hlth (expenditure in
Healthcare), and mil (expenditure in Military), so we won’t
remove those variables. We can remove trade.
# remove trade
formula.str <- "pov ~ country.code+year+income+edu.total+hlth+mil+fdi+lbr.part+unemp+gdp.dflt+gcf+lfdi+lgdp.dflt"
# Using single imputed data set
fit1 <- lapply(1:countries1.imputed$m, function(i) lm(as.formula(formula.str),
data = complete(countries1.imputed, i)))
# gvif
gvif <- lapply(fit1, vif)
lapply(gvif, function(g) {
g[g[, 3] > 2, 3]
})
## [[1]]
## income hlth mil
## 2.028514 2.360405 2.029321
##
## [[2]]
## income hlth mil
## 2.049359 2.435536 2.046785
##
## [[3]]
## income hlth mil unemp lfdi
## 2.044805 2.212026 2.039383 2.014668 2.633095
The adjusted GVIF are acceptably low.
Coefficients overview.
# find coef of variable other than country.code
var <- c("year", "incomeL", "incomeLM", "incomeUM",
"edu.total", "hlth", "mil", "fdi", "lbr.part",
"unemp", "gdp.dflt", "gcf", "lfdi", "lgdp.dflt")
res <- do.call(rbind, lapply(fit1, function(f) f$coefficients[var]))
kable(res)
| year | incomeL | incomeLM | incomeUM | edu.total | hlth | mil | fdi | lbr.part | unemp | gdp.dflt | gcf | lfdi | lgdp.dflt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| -0.3485606 | 10.595420 | 0.0844078 | -2.457466 | -0.6609645 | 0.3284486 | -0.3528113 | 0 | 0.2797132 | -0.0862144 | 0.0021653 | -0.2595239 | -0.1608419 | 0.0866443 |
| -0.3600984 | 8.758320 | -0.3949656 | -2.496140 | -0.8036192 | 0.5609850 | -0.3605085 | 0 | 0.2612510 | -0.0554314 | 0.0020200 | -0.2913577 | -0.1278014 | 0.1578096 |
| -0.3216171 | 9.581061 | -0.5076219 | -2.526680 | -0.8734249 | 0.3265296 | -0.3955441 | 0 | 0.1815632 | -0.0654028 | 0.0010086 | -0.2423669 | -0.3676597 | 0.0922797 |
There are several sign-switching in income coefficients,
indicating the effect of removing some multicollinearity.
fit1.summ <- lapply(fit1, function(f) {
k <- length(f$coefficients) - 1
pred <- predict(f, countries.test.4)
data.frame(`Train Adj.R2` = summary(f)$adj.r.squared,
`Test Adj.R2` = adjR2(pred, countries.test.4$pov,
k))
})
res <- do.call(rbind.data.frame, fit1.summ)
res <- cbind(data.frame(Set = 1:3), res)
kable(res)
| Set | Train.Adj.R2 | Test.Adj.R2 |
|---|---|---|
| 1 | 0.8897643 | 0.6328398 |
| 2 | 0.8891405 | 0.6453508 |
| 3 | 0.8862279 | 0.6449837 |
Directly removing variables seems to be penalising our test results. As our interest is to investigate the effect of certain variables on poverty, the yielded R-Squared is within acceptable range. We can continue variable selection on the new models.
formula.str <- "pov ~ country.code+year+income+edu.total+hlth+mil+fdi+lbr.part+unemp+gdp.dflt+gcf+lfdi+lgdp.dflt"
We can conduct a step-wise AIC variable selection. It is similar to the procedure we use in class, but based on a metric call AIC (Akaike Information Criterion), which is an estimator of prediction error and relative quality of statistical models. The lower AIC is, the better the model fits. \
# helper
fitAIC <- function(i) {
set.seed(1984)
fit <- lm(as.formula(formula.str), complete(countries1.imputed,
i))
aic.fit <- stepAIC(fit, trace = T, direction = "backward")
return(aic.fit)
}
# build models
aic.fits <- lapply(1:countries1.imputed$m, fitAIC)
## Start: AIC=6666.13
## pov ~ country.code + year + income + edu.total + hlth + mil +
## fdi + lbr.part + unemp + gdp.dflt + gcf + lfdi + lgdp.dflt
##
## Df Sum of Sq RSS AIC
## - fdi 1 3 56314 6664.2
## - lgdp.dflt 1 15 56326 6664.6
## <none> 56311 6666.1
## - mil 1 101 56412 6667.4
## - gdp.dflt 1 110 56421 6667.7
## - unemp 1 127 56438 6668.3
## - hlth 1 225 56536 6671.5
## - lfdi 1 275 56586 6673.1
## - edu.total 1 660 56971 6685.6
## - gcf 1 2671 58982 6749.5
## - lbr.part 1 3320 59631 6769.7
## - income 3 7420 63731 6888.2
## - year 1 7636 63947 6898.5
## - country.code 167 133562 189873 8572.2
##
## Step: AIC=6664.23
## pov ~ country.code + year + income + edu.total + hlth + mil +
## lbr.part + unemp + gdp.dflt + gcf + lfdi + lgdp.dflt
##
## Df Sum of Sq RSS AIC
## - lgdp.dflt 1 15 56329 6662.7
## <none> 56314 6664.2
## - mil 1 102 56416 6665.5
## - gdp.dflt 1 110 56424 6665.8
## - unemp 1 128 56442 6666.4
## - hlth 1 232 56546 6669.8
## - lfdi 1 284 56599 6671.5
## - edu.total 1 661 56975 6683.7
## - gcf 1 2672 58986 6747.7
## - lbr.part 1 3323 59637 6767.9
## - income 3 7452 63766 6887.3
## - year 1 7664 63978 6897.4
## - country.code 167 133766 190080 8572.2
##
## Step: AIC=6662.72
## pov ~ country.code + year + income + edu.total + hlth + mil +
## lbr.part + unemp + gdp.dflt + gcf + lfdi
##
## Df Sum of Sq RSS AIC
## <none> 56329 6662.7
## - mil 1 103 56432 6664.1
## - gdp.dflt 1 139 56468 6665.3
## - unemp 1 151 56480 6665.6
## - hlth 1 227 56556 6668.1
## - lfdi 1 286 56616 6670.1
## - edu.total 1 685 57014 6683.0
## - gcf 1 2658 58987 6745.7
## - lbr.part 1 3321 59650 6766.3
## - income 3 7481 63810 6886.5
## - year 1 8040 64369 6906.6
## - country.code 167 133751 190080 8570.2
## Start: AIC=6676.53
## pov ~ country.code + year + income + edu.total + hlth + mil +
## fdi + lbr.part + unemp + gdp.dflt + gcf + lfdi + lgdp.dflt
##
## Df Sum of Sq RSS AIC
## - fdi 1 0 56630 6674.5
## - unemp 1 48 56678 6676.1
## - lgdp.dflt 1 51 56681 6676.2
## <none> 56630 6676.5
## - gdp.dflt 1 96 56726 6677.7
## - mil 1 103 56733 6677.9
## - lfdi 1 177 56807 6680.3
## - hlth 1 626 57255 6694.8
## - edu.total 1 814 57444 6700.8
## - lbr.part 1 2919 59549 6767.2
## - gcf 1 3388 60018 6781.6
## - income 3 5751 62380 6848.8
## - year 1 8028 64657 6918.9
## - country.code 167 133970 190600 8579.3
##
## Step: AIC=6674.53
## pov ~ country.code + year + income + edu.total + hlth + mil +
## lbr.part + unemp + gdp.dflt + gcf + lfdi + lgdp.dflt
##
## Df Sum of Sq RSS AIC
## - unemp 1 48 56678 6674.1
## - lgdp.dflt 1 51 56681 6674.2
## <none> 56630 6674.5
## - gdp.dflt 1 96 56726 6675.7
## - mil 1 104 56733 6675.9
## - lfdi 1 194 56823 6678.8
## - hlth 1 630 57260 6692.9
## - edu.total 1 816 57445 6698.9
## - lbr.part 1 2920 59550 6765.2
## - gcf 1 3389 60019 6779.7
## - income 3 5762 62391 6847.1
## - year 1 8088 64718 6918.6
## - country.code 167 134109 190739 8578.6
##
## Step: AIC=6674.1
## pov ~ country.code + year + income + edu.total + hlth + mil +
## lbr.part + gdp.dflt + gcf + lfdi + lgdp.dflt
##
## Df Sum of Sq RSS AIC
## <none> 56678 6674.1
## - lgdp.dflt 1 75 56753 6674.5
## - gdp.dflt 1 93 56771 6675.1
## - mil 1 99 56777 6675.3
## - lfdi 1 199 56877 6678.6
## - hlth 1 593 57271 6691.3
## - edu.total 1 815 57493 6698.4
## - lbr.part 1 3111 59789 6770.6
## - gcf 1 3357 60034 6778.1
## - income 3 5770 62448 6846.8
## - year 1 8048 64726 6916.8
## - country.code 167 134415 191093 8580.0
## Start: AIC=6724.32
## pov ~ country.code + year + income + edu.total + hlth + mil +
## fdi + lbr.part + unemp + gdp.dflt + gcf + lfdi + lgdp.dflt
##
## Df Sum of Sq RSS AIC
## - fdi 1 1 58119 6722.4
## - lgdp.dflt 1 17 58135 6722.9
## - gdp.dflt 1 24 58141 6723.1
## - unemp 1 60 58178 6724.2
## <none> 58117 6724.3
## - mil 1 126 58243 6726.3
## - lfdi 1 226 58344 6729.5
## - hlth 1 261 58378 6730.6
## - edu.total 1 1233 59351 6761.0
## - lbr.part 1 1586 59703 6771.9
## - gcf 1 2226 60343 6791.6
## - year 1 5283 63400 6882.7
## - income 3 6818 64935 6922.8
## - country.code 167 137736 195854 8629.4
##
## Step: AIC=6722.36
## pov ~ country.code + year + income + edu.total + hlth + mil +
## lbr.part + unemp + gdp.dflt + gcf + lfdi + lgdp.dflt
##
## Df Sum of Sq RSS AIC
## - lgdp.dflt 1 17 58136 6720.9
## - gdp.dflt 1 24 58142 6721.1
## - unemp 1 60 58178 6722.3
## <none> 58119 6722.4
## - mil 1 125 58244 6724.3
## - lfdi 1 245 58363 6728.1
## - hlth 1 260 58378 6728.6
## - edu.total 1 1233 59351 6759.0
## - lbr.part 1 1585 59704 6770.0
## - gcf 1 2225 60343 6789.6
## - year 1 5287 63405 6880.8
## - income 3 6824 64943 6921.0
## - country.code 167 137971 196090 8629.6
##
## Step: AIC=6720.92
## pov ~ country.code + year + income + edu.total + hlth + mil +
## lbr.part + unemp + gdp.dflt + gcf + lfdi
##
## Df Sum of Sq RSS AIC
## - gdp.dflt 1 37 58173 6720.1
## <none> 58136 6720.9
## - unemp 1 79 58215 6721.4
## - mil 1 127 58263 6722.9
## - lfdi 1 248 58384 6726.7
## - hlth 1 257 58394 6727.1
## - edu.total 1 1259 59395 6758.4
## - lbr.part 1 1578 59714 6768.3
## - gcf 1 2208 60344 6787.6
## - year 1 5516 63652 6886.0
## - income 3 6844 64980 6920.0
## - country.code 167 137963 196099 8627.7
##
## Step: AIC=6720.09
## pov ~ country.code + year + income + edu.total + hlth + mil +
## lbr.part + unemp + gcf + lfdi
##
## Df Sum of Sq RSS AIC
## <none> 58173 6720.1
## - unemp 1 79 58252 6720.6
## - mil 1 134 58307 6722.3
## - hlth 1 263 58436 6726.4
## - lfdi 1 271 58444 6726.6
## - edu.total 1 1334 59507 6759.9
## - lbr.part 1 1559 59733 6766.8
## - gcf 1 2174 60347 6785.7
## - year 1 5600 63773 6887.5
## - income 3 6814 64987 6918.2
## - country.code 167 138013 196186 8626.5
# check final terms of each model
lapply(aic.fits, function(mod) formula(mod$terms))
## [[1]]
## pov ~ country.code + year + income + edu.total + hlth + mil +
## lbr.part + unemp + gdp.dflt + gcf + lfdi
## <environment: 0x56470fa3a7f0>
##
## [[2]]
## pov ~ country.code + year + income + edu.total + hlth + mil +
## lbr.part + gdp.dflt + gcf + lfdi + lgdp.dflt
## <environment: 0x56470c707030>
##
## [[3]]
## pov ~ country.code + year + income + edu.total + hlth + mil +
## lbr.part + unemp + gcf + lfdi
## <environment: 0x56471a121b00>
The most commonly removed variables are:
gdp.dfltldgp.dfltunempfdi# predict with each model
aic.pred <- lapply(aic.fits, predict, newdata = countries.test.4)
# calculate adj r2
aic.train.r2 <- unlist(lapply(aic.fits, function(model) summary(model)$adj.r.squared))
k <- lapply(aic.fits, function(m) length(m$coefficients))
aic.test.r2 <- unlist(mapply(adjR2, aic.pred, k = k,
MoreArgs = list(orig = countries.test.4$pov)))
res <- data.frame(`set no.` = 1:3, train = aic.train.r2,
test = aic.test.r2)
We can compare Adjusted R-Squared to estimate relative over-fitting in the new models.
kable(res)
| set.no. | train | test |
|---|---|---|
| 1 | 0.8898615 | 0.6331120 |
| 2 | 0.8891796 | 0.6480423 |
| 3 | 0.8863243 | 0.6467902 |
Performance on train set and test set are not affected. However, we have simplified the model by removing some unnecessary terms. We can update our models.
formula.str <- "pov ~ country.code+year+income+edu.total+hlth+mil+lbr.part+gcf+lfdi"
fit3 <- lapply(1:countries1.imputed$m, function(i) lm(as.formula(formula.str),
data = complete(countries1.imputed, i)))
# Build models with all generated data set and
# pool the estimates
fit4 <- with(data = countries1.imputed, exp = lm(as.formula(formula.str)))
fit4.combined <- pool(fit4)
# dummy lm model
fit4.dummy <- lm(as.formula(formula.str), data = complete(countries1.imputed,
1))
# replace coefficients of dummy model & predict
fit4.dummy$coefficients <- fit4.combined$pooled$estimate
fit3.summ <- lapply(fit3, function(f) {
k <- length(f$coefficients) - 1
pred <- predict(f, countries.test.4)
resid <- pred - countries.test.4$pov
data.frame(`Train MSE` = mean(summary(f)$residuals^2),
`Test MSE` = mean(resid^2), `Train MAE` = mean(abs(summary(f)$residuals)),
`Test MAE` = mean(abs(test.resid)), `Train Adj.R2` = summary(f)$adj.r.squared,
`Test Adj.R2` = adjR2(pred, countries.test.4$pov,
k))
})
k <- length(fit4.dummy$coefficients) - 1
fit4.pred <- predict(fit4.dummy, countries.test.4)
resid <- fit4.pred - countries.test.4$pov
fit4.summ <- data.frame(`Train MSE` = mean(summary(fit4.dummy)$residuals^2),
`Test MSE` = mean(resid^2), `Train MAE` = mean(abs(summary(fit4.dummy)$residuals)),
`Test MAE` = mean(abs(test.resid)), `Train Adj.R2` = pool.r.squared(fit4,
adjusted = T)[, "est"], `Test Adj.R2` = adjR2(fit4.pred,
countries.test.4$pov, k))
res <- do.call(rbind.data.frame, fit3.summ)
res <- rbind(res, fit4.summ)
res <- cbind(data.frame(Set = c(1:3, "Pooled")), res)
kable(res)
| Set | Train.MSE | Test.MSE | Train.MAE | Test.MAE | Train.Adj.R2 | Test.Adj.R2 |
|---|---|---|---|---|---|---|
| 1 | 30.72395 | 14.84489 | 3.443985 | 2.08833 | 0.8894174 | 0.6398425 |
| 2 | 30.87341 | 14.39591 | 3.432739 | 2.08833 | 0.8888794 | 0.6507353 |
| 3 | 31.60740 | 14.42964 | 3.430773 | 2.08833 | 0.8862376 | 0.6499170 |
| Pooled | 30.72395 | 14.47949 | 3.443985 | 2.08833 | 0.8881862 | 0.6487075 |
Coefficients overview.
var <- c("year", "incomeL", "incomeLM", "incomeUM",
"edu.total", "hlth", "mil", "lbr.part", "gcf",
"lfdi")
res <- do.call(rbind, lapply(fit3, function(f) f$coefficients[var]))
kable(res)
| year | incomeL | incomeLM | incomeUM | edu.total | hlth | mil | lbr.part | gcf | lfdi |
|---|---|---|---|---|---|---|---|---|---|
| -0.3508013 | 10.371014 | -0.1281986 | -2.599338 | -0.7296054 | 0.2851357 | -0.3604509 | 0.2846212 | -0.2393414 | -0.1664479 |
| -0.3682635 | 8.556595 | -0.5711252 | -2.624260 | -0.8530796 | 0.5317054 | -0.3694068 | 0.2625263 | -0.2744927 | -0.1358155 |
| -0.3250076 | 9.407023 | -0.6753246 | -2.632387 | -0.9032975 | 0.3074890 | -0.4053229 | 0.1833561 | -0.2258514 | -0.3926658 |
We can further treat over-fitting with regularization.
Helper functions to standardise data.
sd0 <- function(vct) {
if (!is.numeric(vct)) {
return(NA)
}
return(sd(vct, na.rm = T))
}
std0 <- function(vct, scl) {
if (!is.numeric(vct)) {
return(vct)
}
return(vct/scl)
}
standardise <- function(train, test) {
# save number of train and combine both data
# set
trainCases <- 1:nrow(train)
data <- rbind(train, test)
# calc scaler
scaler <- unlist(lapply(data, sd0))
# get the numeric columns
numCols <- which(unlist(lapply(data, is.numeric)))
# divide numeric columns by scalers, and
# combine with factor columns
num <- as.data.frame(mapply(std0, vct = data[,
numCols], scl = scaler[numCols], SIMPLIFY = T))
fct <- data[, -numCols]
final <- cbind(fct, num)
# split to train and test
trainSet <- final[trainCases, ]
testSet <- final[-trainCases, ]
res <- list(final = final, train = trainSet, test = testSet,
scaler = scaler)
return(res)
}
Helper function to build the optimal model.
optimalModel <- function(formula, train, test) {
set.seed(1984)
# standise and save number of train cases (to
# split later)
std.data <- standardise(train, test)
trainCases <- 1:nrow(train)
# matrix-ise
xs <- model.matrix(formula, std.data$final)[, -1]
ys <- std.data$final$pov
# split data
train.x <- xs[trainCases, ]
train.y <- ys[trainCases]
test.x <- xs[-trainCases, ]
test.y <- ys[-trainCases]
# list of tested alpha, resolution = 0.1
alphas <- seq(0, 1, 0.1)
# build a bunch of models
models <- lapply(alphas, function(a) cv.glmnet(train.x,
train.y, type.measure = "mse", alpha = a))
cv.error <- unlist(lapply(models, function(model) model$cvm[model$lambda ==
model$lambda.min]))
# best model
best.model.idx <- which.min(cv.error)
# optimal alpha and lambda
alpha.opt <- alphas[best.model.idx]
lambda.opt <- models[[best.model.idx]]$lambda.min
best.model <- glmnet(train.x, train.y, alpha = alpha.opt,
lambda = lambda.opt)
# predict for test data
train.fit <- predict(best.model, train.x)
test.fit <- predict(best.model, test.x)
# evaluation - train
k <- best.model$df
train.r2 <- best.model$dev.ratio
train.adjR2 <- adjR2(train.fit, train.y, k)
train.resid <- train.fit - train.y
train.mse <- mean(train.resid^2)
train.rmse <- sqrt(train.mse)
train.mae <- mean(abs(train.resid))
train.mape <- mean(abs(train.resid/train.y))
# evaluation - test
test.r2 <- r2(test.fit, test.y)
test.adjR2 <- adjR2(test.fit, test.y, k)
test.resid <- test.fit - test.y
test.mse <- mean(test.resid^2)
test.rmse <- sqrt(test.mse)
test.mae <- mean(abs(test.resid))
test.mape <- mean(abs(test.resid/test.y))
# final results
results <- list(train = list(data = train, r2 = train.r2,
adj.r2 = train.adjR2, residuals = train.resid,
mse = train.mse, rmse = train.rmse, mae = train.mae,
mape = train.mape), test = list(data = test,
r2 = test.r2, adj.r2 = test.adjR2, residuals = test.resid,
mse = test.mse, rmse = test.rmse, mae = test.mae,
mape = test.mape), scaler = std.data$scaler,
alpha = alpha.opt, lambda = lambda.opt, model = best.model)
return(results)
}
Build the optimal models using each imputated data set.
# helper function to build from a specific
# imputated data
buildWith <- function(set, imputed, test, formula,
fun) {
train <- complete(imputed, set)
model <- fun(formula, train, test)
return(model)
}
set.seed(1984)
models <- lapply(1:countries1.imputed$m, buildWith,
imputed = countries1.imputed, test = countries.test.4,
formula = as.formula(formula.str), fun = optimalModel)
Display parameters and evaluation.
result.list <- lapply(models, function(mod) {
data.frame(alpha = mod$alpha, lambda = mod$lambda,
`Train MSE` = mod$train$mse, `Test MSE` = mod$test$mse,
`Train MAE` = mod$train$mae, `Test MAE` = mod$test$mae,
`Train R2` = mod$train$r2, `Test R2` = mod$test$r2,
`Train Adj.R2` = mod$train$adj.r2, `Test Adj.R2` = mod$test$adj.r2)
})
result.df <- do.call(rbind.data.frame, result.list)
result.df <- cbind(data.frame(Set = 1:countries1.imputed$m),
result.df)
kable(result.df)
| Set | alpha | lambda | Train.MSE | Test.MSE | Train.MAE | Test.MAE | Train.R2 | Test.R2 | Train.Adj.R2 | Test.Adj.R2 |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.5 | 0.0001625 | 0.1200250 | 0.0576166 | 0.2146240 | 0.1451234 | 0.8995019 | 0.7558061 | 0.8888183 | 0.6402383 |
| 2 | 0.9 | 0.0000900 | 0.1205717 | 0.0558694 | 0.2135989 | 0.1445973 | 0.8990441 | 0.7632111 | 0.8883119 | 0.6511480 |
| 3 | 0.5 | 0.0001782 | 0.1234911 | 0.0560536 | 0.2136489 | 0.1414735 | 0.8965997 | 0.7624304 | 0.8856076 | 0.6499977 |
There are some improvement on the test data performance based on adjusted R-Squared. MSE and MAE remain low on both data sets.
Coefficients overview.
var <- c("year", "incomeL", "incomeLM", "incomeUM",
"edu.total", "hlth", "mil", "lbr.part", "gcf",
"lfdi")
res <- do.call(rbind, lapply(models, function(f) coefficients(f$model)[var,
]))
kable(res)
| year | incomeL | incomeLM | incomeUM | edu.total | hlth | mil | lbr.part | gcf | lfdi |
|---|---|---|---|---|---|---|---|---|---|
| -0.1823323 | 0.6786852 | 0.0123409 | -0.1535162 | -0.0725608 | 0.0343084 | -0.0223762 | 0.1487238 | -0.1034815 | -0.0357769 |
| -0.1917452 | 0.5708028 | -0.0104157 | -0.1533967 | -0.0842415 | 0.0733698 | -0.0231306 | 0.1439861 | -0.1183924 | -0.0288423 |
| -0.1684479 | 0.6259938 | -0.0162978 | -0.1530337 | -0.0890159 | 0.0403272 | -0.0258454 | 0.0994393 | -0.0983796 | -0.0600141 |
All non-country.code variables are non-zero. We can observe that the
regularisation models just remove the effect of
country.code, for countries that have similar
“baseline”.
Check assumption outliers #### 3.2.4.3. Interpretation